def ll2s(x): """ converts a list of lists, e.g. [['guid1',2],['guid2',0]] into a set {'guid1','guid2'} """ neighbour_set = set() for neighbour in x: neighbour_set.add(neighbour[0]) return neighbour_set # define directory where the fastas are fastadir = os.path.join('..', 'demos', 'AA041', 'fasta') outputdir = os.path.join('..', 'demos', 'AA041', 'output') # instantiate client fn3c = fn3Client( ) # expects operation on local host; pass baseurl if somewhere else. # names of the clustering algorithms clusters = fn3c.clustering() existing_guids = set(fn3c.guids()) clustering_created = False print("There are {0} existing guids".format(len(existing_guids))) # add control fasta files. The system evaluates the %N in terms of the population existing # we load 50 randomly selected guids as controls for i, fastafile in enumerate( glob.glob(os.path.join(fastadir, 'control', '*.fasta'))): guid = "ctrl_" + os.path.basename(fastafile).replace('.fasta', '') seq = fn3c.read_fasta_file(fastafile)['seq'] if not guid in existing_guids:
p.mkdir(parents=True, exist_ok=True)

# create the input directory too (including parents) if it is missing
p = pathlib.Path(inputdir)
p.mkdir(parents=True, exist_ok=True)

# determine input files; the list is shuffled, so files are taken in random order
inputfiles = glob.glob(os.path.join(inputdir, '*.fasta'))
random.shuffle(inputfiles)

# refuse to continue when fewer fasta files exist than were asked for
if len(inputfiles) < max_sequences:
    raise ValueError(
        "Asked to add {0} sequences, but only {1} are available in the input directory {2}"
        .format(max_sequences, len(inputfiles), inputdir))

# keep only as many files as were requested
inputfiles = inputfiles[:max_sequences]

print("opening connection to fn3 server")
fn3c = fn3Client(baseurl="http://127.0.0.1:5020")

# determine all masked positions
excluded_positions = fn3c.nucleotides_excluded()

# determine how many samples there are currently in the server.
nSamples = len(fn3c.guids())
print("There are {0} existing samples. Adding more ..".format(nSamples))

# create output file with header line; the file name carries the sample
# count found at start-up
outputfile = os.path.join(outputdir, 'timings_{0}.tsv'.format(nSamples))
nAdded_this_batch = 0
with open(outputfile, 'w+t') as f:
    # tab-separated column titles, terminated by a newline
    output_line = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
        'nSamples',
        's_insert', 'e_insert', 'd_insert',
        's_read', 'e_read', 'd_read')
Example usage: # first, a server must be running python findNeighbour3-server.py ../demos/simulation/config/config.json # then simulations must be generated (e.g. with run_simulation) python run_simulation.py ../output/simulation_set_1""")
# NOTE(review): the text above is the tail of a string literal opened before
# this excerpt; it is left exactly as found.

# single positional argument: the directory to read from.  It is declared
# with nargs=1 and read back below with [0].
parser.add_argument('inputdir', type=str, nargs=1, help='data will be read from the inputdir')
args = parser.parse_args()

# absolute path of the directory given on the command line
basedir = os.path.abspath(args.inputdir[0])

# connect to server
fn3c = fn3Client("http://localhost:5020")

# iterate over simulated data: one pass per path matched directly under basedir
for inputdir in glob.glob(os.path.join(basedir, '*')):
    print(inputdir)

    # define filenames; all of them live inside the current inputdir
    fasta_filename = os.path.join(inputdir, 'phylogeny.fasta')
    sequence_filename = os.path.join(inputdir, 'phylogeny.txt')
    observed_filename = os.path.join(inputdir, 'observed.txt')
    tree_filename = os.path.join(inputdir, 'phylogeny.nwk')
    ref_filename = os.path.join(inputdir, 'reference.fasta')
    treepic_filename = os.path.join(inputdir, '{0}.png'.format('tree_image'))
    annotated_treepic_filename = os.path.join(
        inputdir, '{0}.png'.format('annotated_tree_image'))
def __init__(self) -> None:
    """Create an fn3Client and keep it on the instance as self.fn3c.

    The client is constructed with no arguments.
    NOTE(review): presumably this targets a server on the local host, and
    construction is expected to succeed -- confirm against fn3Client.
    """
    self.fn3c = fn3Client()  # expect success