def sampleAllSpecies(speciesFile='./species_list.txt',
                     sample_directions=['upstream', 'downstream'],
                     dataPath_out='../data/sample_seqs/fasta/',
                     sample_range=2000,
                     take_annotated_utr=False,
                     masking='hard'):
    """
    Notes:
        Samples sequences up/downstream from each gene of each genome of a
        list of species, generating FASTA files.

    Args:
        speciesFile       = './species_list.txt'        # file listing species names to sample from, e.g. 'Anopheles gambiae', 'Aedes aegypti'
        sample_directions = ['upstream', 'downstream']  # list: directions to sample seqs from

    Dependencies:
        --> data/sample_seqs/fasta/<species_name>_<up/down>stream.fasta
        --> meme_dataPrepper.py
            meme_bgfileGen.py
        <-- speciesManage.generate_list()
            sampleSequences_write() <-- sampleSequences_read() <-- setupGenome()
    """
    import pprint
    import speciesManage  # generate_list() builds a list of species names matching MySQL EnsEMBL databases
    species_list = speciesManage.generate_list(speciesFile)  # species names corresponding to EnsEMBL MySQL db names
    for sample_direction in sample_directions:
        for species in species_list:
            print(species)
            genome = setupGenome(species,
                                 db_host='localhost',
                                 db_user='******',
                                 db_pass='******',
                                 db_release=73)
            samples_read = sampleSequences_read(genome,
                                                sample_direction=sample_direction,
                                                sample_range=sample_range,
                                                take_annotated_utr=take_annotated_utr,
                                                masking=masking,
                                                sample_data={},
                                                sample_seqs={},
                                                test_it=False)
            samples_write = sampleSequences_write(genome,
                                                  samples_read,
                                                  fasta_it=True,
                                                  pickle_it=True,
                                                  include_genes=False)
            pprint.pprint(samples_write[1])  # show the header
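# Example call (a sketch, not a tested invocation: it assumes a local MySQL mirror of
# the EnsEMBL release-73 databases and that setupGenome()/sampleSequences_read()/
# sampleSequences_write() are importable from this module; the '******' credentials
# above are placeholders to replace):
#
#   sampleAllSpecies(speciesFile='./species_list.txt',
#                    sample_directions=['upstream'],
#                    sample_range=2000,
#                    take_annotated_utr=False,
#                    masking='hard')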
def allSpecies(speciesFile='./species_list.txt',
               dataPath_in='../data/meme_data/in/',
               dataPath_out='../data/meme_data/in/randomFasta/',
               n_replicates=5,
               n_seqs=2000):
    import os
    import pickle
    import random
    import speciesManage  # generate_list() builds a list of species names matching MySQL EnsEMBL databases
    species_list = speciesManage.generate_list(speciesFile)  # species names corresponding to EnsEMBL MySQL db names
    for species in species_list:
        print(species)
        suffix_in = '_upstream_memeready_all_simpleMasked.fasta'
        filename_in = dataPath_in + species + suffix_in
        headers = {}
        seqs = {}
        dataPath_species = dataPath_out + species + '/'
        if not os.path.exists(dataPath_species):
            os.makedirs(dataPath_species)  # new dir for species
        else:
            print('\tSamples already AVAILABLE for: ' + species)
            continue

        # IMPORT POPULATION FASTA
        print('\tImporting FASTA...')
        file_in = open(filename_in)
        while True:
            header = file_in.readline()
            sequence = file_in.readline()
            if header == '':  # break when finished
                break
            header_split = header.split('\t')
            geneId = header_split[0].replace('>', '')
            headers[geneId] = header
            seqs[geneId] = sequence
        file_in.close()

        # RANDOM SAMPLING LIST
        if os.path.isfile(dataPath_species + 'randomList_geneIds.p'):
            print('\tRandom Fasta: AVAILABLE -> Importing...')
            randList_geneIds = pickle.load(open(dataPath_species + 'randomList_geneIds.p', 'rb'))
        else:
            print('\tRandom Fasta: UNAVAILABLE -> Generating...')
            geneIds = list(headers.keys())
            randList = [[random.randrange(0, len(geneIds)) for seq in range(n_seqs)]
                        for replicate in range(n_replicates)]
            randList_geneIds = [[geneIds[seq] for seq in replicate] for replicate in randList]
            pickle.dump(randList_geneIds, open(dataPath_species + 'randomList_geneIds.p', 'wb'))

        # GENERATE SAMPLE FASTA
        for i, replicate in enumerate(randList_geneIds):
            suffix_out = suffix_in.replace('_all_', '_' + str(n_seqs) + '_' + str(i) + '_')
            #                                         ^ n_seqs         ^ replicate id
            filename_out = dataPath_species + species + suffix_out
            file_out = open(filename_out, 'w')
            for seq in replicate:
                file_out.write(headers[seq] + seqs[seq])
            file_out.close()
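# Example call (a sketch; assumes the '<species>_upstream_memeready_all_simpleMasked.fasta'
# files already exist under dataPath_in, e.g. as produced by the sampling step above):
#
#   allSpecies(speciesFile='./species_list.txt',
#              dataPath_in='../data/meme_data/in/',
#              dataPath_out='../data/meme_data/in/randomFasta/',
#              n_replicates=5,
#              n_seqs=2000)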
def allSpecies(speciesFile, dataPath_in, dataPath_out, resultFormat, verbosity, eValue, nMotifs):
    """
    Notes:
        Runs DREME on the random upstream FASTA of each species listed.

    Args:
        speciesFile  = './species_list.txt'
        dataPath_in  = '../data/meme_data/in/random_dreme/'
        dataPath_out = '../data/meme_data/out/'
        verbosity    = 2
        eValue       = 0.0001
        nMotifs      = 100
        resultFormat = 'png'
        hpc          = True   # is this run via HPC?
    """
    import subprocess
    import speciesManage
    suffix_in = '_upstream_dremeready_all_simpleMasked_random.fasta'
    suffix_out = '_100bp'
    species_list = speciesManage.generate_list(speciesFile)  # species names corresponding to EnsEMBL MySQL db names
    for species in species_list:
        filename_in = dataPath_in + species + suffix_in
        dreme_out = dataPath_out + species + suffix_out
        # COMMAND LINE: dreme -png -v 1 -oc . -t 18000 -p anopheles_gambiae_upstream_memeready_all_simpleMasked.fasta -e 0.05 -m 100 -dfile description
        dreme = [
            'dreme',                 # DREME program
            '-' + resultFormat,      # result format
            '-v', str(verbosity),    # verbosity, 1:5
            '-oc', dreme_out,        # overwrite directory <name> and write the results into it
            #'-t', '18000',          # time limit
            '-p', filename_in,       # positive file: input FASTA data
            #'-n', ... ,             # negative file: null model of FASTA sequences, i.e. dinucleotide shuffling
            '-e', str(eValue),       # E-value cut-off; motifs beyond it are not taken into the long analysis
            '-m', str(nMotifs),      # number of motifs to search for; runtime scales linearly with it
        ]
        # if hpc == True:
        #     dreme = ['qsub'] + dreme
        all_dreme = [dreme]
        for dreme in all_dreme:
            # start = subprocess.Popen(['date'], stdout=subprocess.PIPE).communicate()[0]
            subprocess.call(dreme)
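# Example call (a sketch; assumes the MEME Suite's `dreme` binary is on PATH and the
# '*_dremeready_all_simpleMasked_random.fasta' inputs exist; values mirror the docstring):
#
#   allSpecies(speciesFile='./species_list.txt',
#              dataPath_in='../data/meme_data/in/random_dreme/',
#              dataPath_out='../data/meme_data/out/',
#              resultFormat='png',
#              verbosity=2,
#              eValue=0.0001,
#              nMotifs=100)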
def allSpecies(speciesFile, motifsPath_in, fastaPath_in, dataPath_out, verbosity, threshold):
    """
    Notes:
        Runs FIMO to scan each species' random upstream FASTA for matches to
        that species' DREME motifs.

    Args:
        speciesFile   = './species_list.txt'
        motifsPath_in = '../data/meme_data/out/dreme_100bp/sampled_all_hpc/'
        fastaPath_in  = '../data/meme_data/in/random_dreme/'
        dataPath_out  = '../data/meme_data/out/fimo/'
        verbosity     = 2
        threshold     = 0.1   # q-value cut-off to filter out bad matches
    """
    import os
    import subprocess
    import speciesManage
    species_list = speciesManage.generate_list(speciesFile)  # species names corresponding to EnsEMBL MySQL db names
    if not os.path.exists(dataPath_out):
        os.makedirs(dataPath_out)
    suffix_in_motifs = '_100bp/dreme.html'
    suffix_in_fasta = '_upstream_dremeready_all_simpleMasked_random.fasta'
    suffix_out = '_100bp'
    for species in species_list:
        print(species)
        motifs_in = motifsPath_in + species + suffix_in_motifs
        fasta_in = fastaPath_in + species + suffix_in_fasta
        fimo_out = dataPath_out + species + suffix_out
        # COMMAND LINE: fimo -oc fimo_100bp/ -thresh 0.1 -verbosity 2 <motifs: .../dreme.html> <sequences: .../*.fasta>
        fimo = [
            'fimo',                        # FIMO program
            '-verbosity', str(verbosity),  # verbosity, 1:5
            '-oc', fimo_out,               # overwrite directory <name> and write the results into it
            '-thresh', str(threshold),     # significance threshold, q-value
            motifs_in,                     # motif file: DREME motifs to scan with
            fasta_in,                      # sequence file: FASTA to scan
        ]
        all_fimo = [fimo]
        for fimo in all_fimo:
            # start = subprocess.Popen(['date'], stdout=subprocess.PIPE).communicate()[0]
            subprocess.call(fimo)
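# Example call (a sketch; assumes `fimo` from the MEME Suite is on PATH and that DREME
# has already produced '<species>_100bp/dreme.html' under motifsPath_in):
#
#   allSpecies(speciesFile='./species_list.txt',
#              motifsPath_in='../data/meme_data/out/dreme_100bp/sampled_all_hpc/',
#              fastaPath_in='../data/meme_data/in/random_dreme/',
#              dataPath_out='../data/meme_data/out/fimo/',
#              verbosity=2,
#              threshold=0.1)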
def allSpecies(speciesFile, dataPath_in, dataPath_out, verbosity, threshold):
    """
    Notes:
        Runs Tomtom on every pairwise combination of species' DREME motif sets,
        timing each run.

    Args:
        speciesFile  = './species_list.txt'
        dataPath_in  = '../data/meme_data/out/dreme_100bp/sampled_all_hpc/'
        dataPath_out = '../data/meme_data/out/tomtom_100bp/'
        verbosity    = 2
        threshold    = 0.1   # q-value cut-off to filter out bad matches
    """
    import itertools
    import subprocess
    from datetime import datetime
    import speciesManage
    suffix_in = '_100bp/dreme.html'
    suffix_out = '_100bp'
    species_list = speciesManage.generate_list(speciesFile)  # species names corresponding to EnsEMBL MySQL db names
    speciesPairs = list(itertools.product(species_list, species_list))
    for speciesPair in speciesPairs:
        print(speciesPair)
        species_query = speciesPair[0]
        species_database = speciesPair[1]
        filename_in_query = dataPath_in + species_query + suffix_in
        filename_in_database = dataPath_in + species_database + suffix_in
        tomtom_out = dataPath_out + species_query + '_vs_' + species_database + suffix_out
        # COMMAND LINE: tomtom -oc tomtom_100bp/ -thresh 0.1 -verbosity 2 .../anopheles_christyi_100bp/dreme.html .../anopheles_merus_100bp/dreme.html
        tomtom = [
            'tomtom',                      # Tomtom program
            '-verbosity', str(verbosity),  # verbosity, 1:5
            '-oc', tomtom_out,             # overwrite directory <name> and write the results into it
            '-thresh', str(threshold),     # significance threshold, q-value
            filename_in_query,             # query motif file
            filename_in_database,          # target motif database
        ]
        all_tomtom = [tomtom]
        for tomtom in all_tomtom:
            start = subprocess.Popen(['date'], stdout=subprocess.PIPE).communicate()[0].decode()
            subprocess.call(tomtom)
            end = subprocess.Popen(['date'], stdout=subprocess.PIPE).communicate()[0].decode()
            tdelta = (datetime.strptime(end.split()[3], '%H:%M:%S')
                      - datetime.strptime(start.split()[3], '%H:%M:%S'))
            print(tdelta)  # wall-clock runtime of this Tomtom run
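# Example call (a sketch; assumes `tomtom` from the MEME Suite is on PATH; note this
# runs len(species_list)**2 comparisons, since every species is both query and database):
#
#   allSpecies(speciesFile='./species_list.txt',
#              dataPath_in='../data/meme_data/out/dreme_100bp/sampled_all_hpc/',
#              dataPath_out='../data/meme_data/out/tomtom_100bp/',
#              verbosity=2,
#              threshold=0.1)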
def allSpecies(dataPath_in, bfileGeneratorPath, speciesFile='./species_list.txt', maskingChar='n', order=3):
    """
    Notes:
        Iterates meme_bfileGenerator for all species listed.

    Dependencies:
        Executed: see meme_bfileGenerator.py
        Scripts:  see meme_bfileGenerator.py

    Args:
        dataPath_in = '0VB/2kb/data/meme_data/in/'   # path of directory with all species' memeready FASTA files
    """
    print('Generating bfiles for all species...')
    import speciesManage  # generate_list() builds a list of species names matching MySQL EnsEMBL databases
    species_list = speciesManage.generate_list(speciesFile)  # species names corresponding to EnsEMBL MySQL db names
    for species in species_list:
        print(species)
        #file_in = dataPath_in + species.lower().replace(' ', '_') + '_upstream_memeready_all_dusted.fasta'  # ANDY 25_02
        file_in = dataPath_in + species.lower().replace(' ', '_') + '_upstream_memeready_all.fasta'  # ANDY 25_02
        file_out = dataPath_in + species.lower().replace(' ', '_') + '.bg2'
        print(file_out)
        meme_bfileGenerator(file_in, file_out, bfileGeneratorPath, maskingChar, order)
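# Example call (a sketch; meme_bfileGenerator() is defined in meme_bfileGenerator.py,
# and the bfileGeneratorPath value below is purely illustrative):
#
#   allSpecies(dataPath_in='0VB/2kb/data/meme_data/in/',
#              bfileGeneratorPath='./meme_bfileGenerator.py',
#              speciesFile='./species_list.txt',
#              maskingChar='n',
#              order=3)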
def allSpecies(speciesFile, dataPath_in, dataPath_out, n_seqs=10000, len_seq=100, population=True):
    """
    Notes:
        Cuts each species' upstream FASTA into 100 bp sub-sequences, either by
        random non-overlapping sampling or exhaustively, and writes them to a
        DREME-ready FASTA.

    Args:
        speciesFile  = './species_list.txt'                  # location of species config file
        dataPath_in  = '../data/meme_data/in/'               # input directory
        dataPath_out = '../data/meme_data/in/random_dreme/'  # output directory
        n_seqs  = 10000   # 10k sampled seqs
        len_seq = 100     # 100bp long sequence samples
        population = True # if True, chunk the whole meta-sequence instead of random sampling
    """
    import os
    import re
    import sys
    import random
    import speciesManage

    def rand_parts(seq, n, l):
        """Sample n non-overlapping sub-sequences of length l from seq."""
        indices = range(len(seq) - (l - 1) * n)
        result = []
        offset = 0
        for i in sorted(random.sample(indices, n)):
            i += offset
            result.append(seq[i:i + l])
            offset += l - 1
        return result

    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    sys.path.append(os.getcwd())
    suffix_in = '_upstream_memeready_all_simpleMasked.fasta'  # filename component common to all species
    suffix_out = (suffix_in.split('.')[0].replace('memeready', 'dremeready')
                  + '_random' + '.' + suffix_in.split('.')[1])
    species_list = speciesManage.generate_list(speciesFile)  # species names corresponding to EnsEMBL MySQL db names
    for species in species_list:
        print(species)

        # IMPORT FASTA
        filename_in = dataPath_in + species + suffix_in
        filename_out = dataPath_out + species + suffix_out
        file_in = open(filename_in)
        headers = {}  # dict for geneId:header
        seqs = {}     # geneId:seq
        print('\tImporting FASTA...')
        mask = re.compile(r'(N)\1{2,}')  # collapse consecutive masking chars, NNN --> N
        while True:
            header = file_in.readline()
            seq = file_in.readline()
            if header == '':
                break
            header_split = header.split('\t')
            geneId = header_split[0].replace('>', '')
            seq_collapsedMasks = mask.sub('N', seq).rstrip() + 'N'
            #                    ^ append N to delimit each sequence when they are later concatenated
            headers[geneId] = header
            seqs[geneId] = seq_collapsedMasks
        file_in.close()

        # META SEQUENCE: concatenate all sequences for later random sampling
        print('\tconcatenating sequences to form meta-sequence...')
        meta_seq = ''.join([seqs[i] for i in seqs.keys()])
        meta_index = range(len(meta_seq))

        # LOCATION:GENEID: map each bp location of the meta-sequence to the geneId it
        # belongs to; the ~14M-long meta_seq becomes its own location:geneId lookup
        print('\tmapping meta-sequence to geneIds...')
        loc2GeneId = {}
        lens = [len(seqs[i]) for i in seqs.keys()]  # length of each seq
        start = 0
        for k, geneId in enumerate(seqs.keys()):
            end = start + lens[k]
            for loc in range(start, end):
                loc2GeneId[loc] = geneId
            start = end

        # RANDOM NON-OVERLAPPING SAMPLING: sample n_seqs random sub-ranges of length
        # len_seq along the meta_seq, representing non-overlapping sub-seqs
        print('\tgenerating random sub-sequence samples...')
        if population == False:
            randRanges = rand_parts(meta_index, n_seqs, len_seq)
        elif population == True:
            randRanges = chunks(meta_index, len_seq)
        randSamples = []
        for randRange in randRanges:
            geneSet = list(set([loc2GeneId[loc] for loc in randRange]))  # geneIds whose sequences lie in the randRange
            randSamples.append({
                'geneId': geneSet,                               # geneIds within the randRange
                'seq': meta_seq[randRange[0]:randRange[-1] + 1], # sub-seq corresponding to the randRange (inclusive end)
            })

        # GENERATE FASTA
        print('\twriting sampled sequences to FASTA...')
        if not os.path.exists(dataPath_out):  # generate cluster fasta dirs
            print('making directory: ' + dataPath_out)
            os.makedirs(dataPath_out)
        file_out = open(filename_out, 'w')
        for sample in randSamples:
            sample_geneIds = sample['geneId']
            seq = sample['seq']
            file_out.write('>' + '\t'.join(sample_geneIds) + '\n')
            file_out.write(seq + '\n')
        file_out.close()
import speciesManage

speciesFile = './species_list.txt'
dataPath_in = '../data/meme_data/out/dreme_100bp/sampled_all_hpc/'
suffix_in = '_100bp/dreme.txt'

species_to_motifs = {}
species_list = speciesManage.generate_list(speciesFile)

# READ DREME => species:motifs
for species in species_list:
    print(species)
    species_to_motifs[species] = []
    filename_in = dataPath_in + species + suffix_in
    file_in = open(filename_in, 'r')
    count = 0  # number of motifs found for this species
    while True:
        line = file_in.readline()
        if line == '':
            break
        if 'MOTIF' in line:
            count += 1
            motif = line.split(' ')[1].rstrip()  # motif is the second space-separated field of the MOTIF line
            species_to_motifs[species].append(motif)
    file_in.close()
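# Quick sanity check of the harvested motifs (a sketch):
#
#   for species, motifs in sorted(species_to_motifs.items()):
#       print('%s: %d motifs, e.g. %s' % (species, len(motifs), motifs[:3]))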