Ejemplo n.º 1
0
def sampleAllSpecies(speciesFile='./species_list.txt', sample_directions=('upstream', 'downstream'), dataPath_out='../data/sample_seqs/fasta/', sample_range=2000, take_annotated_utr=False, masking='hard'):
    """
    Notes:
        Samples sequences up/downstream from each gene of each genome of a list of species,
        delegating the reading to sampleSequences_read() and the writing (fasta + pickle)
        to sampleSequences_write().

    Args:
        speciesFile         = './species_list.txt'          #str:  file handed to speciesManage.generate_list()
        sample_directions   = ('upstream','downstream')     #iterable of str: directions to sample seqs from
        dataPath_out        = '../data/sample_seqs/fasta/'  #str:  accepted but not used inside this function
        sample_range        = 2000                          #forwarded unchanged to sampleSequences_read()
        take_annotated_utr  = False                         #forwarded unchanged to sampleSequences_read()
        masking             = 'hard'                        #forwarded unchanged to sampleSequences_read()

    Returns:
        None. For every (direction, species) pair the element [1] of the value returned by
        sampleSequences_write() is pretty-printed (the original author calls it "the header").

    Dependencies:
        -->
            data/sample_seqs/fasta/<species_name>_<up/down>stream.fasta
                -->
                    meme_dataPrepper.py
                    meme_bgfileGen.py
        <--
            speciesManage.generate_list.py
            sampleSequences_write()
                <--
                    sampleSequences_read()
                        <--
                            setupGenome()
    """
    # The default for sample_directions is an immutable tuple so it cannot be
    # modified between calls; callers may still pass a list.
    import pprint
    import speciesManage  # generate_list() builds the list of species names (EnsEMBL MySQL db names)

    species_list = speciesManage.generate_list(speciesFile)

    for sample_direction in sample_directions:
        for species in species_list:
            print(species)
            genome = setupGenome(species, db_host='localhost', db_user='******', db_pass='******', db_release=73)
            # Fresh dicts are passed on every call so no state is shared between species.
            samples_read = sampleSequences_read(
                genome,
                sample_direction=sample_direction,
                sample_range=sample_range,
                take_annotated_utr=take_annotated_utr,
                masking=masking,
                sample_data={},
                sample_seqs={},
                test_it=False,
            )
            samples_write = sampleSequences_write(genome, samples_read, fasta_it=True, pickle_it=True, include_genes=False)
            pprint.pprint(samples_write[1])  # show the header
Ejemplo n.º 2
0
def allSpecies( speciesFile = './species_list.txt', dataPath_in = '../data/meme_data/in/', dataPath_out='../data/meme_data/in/randomFasta/', n_replicates=5, n_seqs=2000 ):
    """
    Notes:
        For every species, draws n_replicates random samples (with replacement) of n_seqs
        genes from the species' population FASTA and writes one FASTA file per replicate
        into <dataPath_out>/<species>/. The drawn gene ids are pickled alongside as
        'randomList_geneIds.p'. A species whose output directory already exists is skipped.

    Args:
        speciesFile  = './species_list.txt'                 # file handed to speciesManage.generate_list()
        dataPath_in  = '../data/meme_data/in/'              # directory holding <species>_upstream_memeready_all_simpleMasked.fasta
        dataPath_out = '../data/meme_data/in/randomFasta/'  # parent directory of the per-species output directories
        n_replicates = 5                                    # number of random samples per species
        n_seqs       = 2000                                 # number of genes per sample

    Raises:
        ValueError: the input FASTA yields no records although samples were requested.
    """
    import speciesManage  # generate_list() builds the list of species names (EnsEMBL MySQL db names)
    species_list = speciesManage.generate_list(speciesFile)

    suffix_in = '_upstream_memeready_all_simpleMasked.fasta'

    for species in species_list:
        print(species)
        filename_in      = dataPath_in + species + suffix_in
        dataPath_species = dataPath_out + species + '/'

        # Decide whether to skip BEFORE opening the input, so no handle is leaked.
        if os.path.exists(dataPath_species):
            print('\tSamples already AVAILABLE for: ' + species)
            continue

        # IMPORT POPULATION FASTA
        # The file is read as strict (header line, sequence line) pairs.
        print('\tImporting FASTA...')
        headers = {}
        seqs    = {}
        with open(filename_in) as file_in:
            for header in file_in:
                sequence        = next(file_in, '')
                geneId          = header.split('\t')[0].replace('>', '')
                headers[geneId] = header
                seqs[geneId]    = sequence

        gene_ids = list(headers)
        if not gene_ids and n_seqs > 0 and n_replicates > 0:
            raise ValueError('no FASTA records found in ' + filename_in)

        # The directory is created only after the input was read successfully, so a
        # failed import does not make the species look "already available" next time.
        os.makedirs(dataPath_species)  # New dir for species

        # RANDOM SAMPLING LIST
        pickle_path = dataPath_species + 'randomList_geneIds.p'
        if os.path.isfile(pickle_path):
            # NOTE(review): the directory was created just above, so this branch looks
            # unreachable in practice; kept to preserve the original control flow.
            # pickle must only ever be loaded from trusted files.
            print('\tRandom Fasta: AVAILABLE -> Importing...')
            with open(pickle_path, 'rb') as pickle_in:
                randList_geneIds = pickle.load(pickle_in)
        else:
            print('\tRandom Fasta: UNAVAILABLE -> Generating...')
            # random.choice covers every gene, including the last one, and also works
            # when the population holds a single gene.
            randList_geneIds = [
                [random.choice(gene_ids) for _ in range(n_seqs)]
                for _ in range(n_replicates)
            ]
            with open(pickle_path, 'wb') as pickle_out:
                pickle.dump(randList_geneIds, pickle_out)

        # GENERATE SAMPLE FASTA
        for i, replicate in enumerate(randList_geneIds):
            suffix_out   = suffix_in.replace('_all_', '_' + str(n_seqs) + '_' + str(i) + '_')
            #                                 ^repl -->     ^#seqs             ^replicate id
            filename_out = dataPath_species + species + suffix_out
            with open(filename_out, 'w') as file_out:
                for geneId in replicate:
                    file_out.write(headers[geneId] + seqs[geneId])
Ejemplo n.º 3
0
def allSpecies(speciesFile, dataPath_in, dataPath_out, resultFormat, verbosity, eValue, nMotifs):
    """
    Notes:
        Runs the external `dreme` program once per species listed in speciesFile.

    Args:
        speciesFile     = './species_list.txt'                  # file handed to speciesManage.generate_list()
        dataPath_in     = '../data/meme_data/in/random_dreme/'  # directory with the input fasta files
        dataPath_out    = '../data/meme_data/out/'              # prefix of the per-species output directory
        resultFormat    = 'png'                                 # passed to dreme as the flag '-<resultFormat>'
        verbosity       = 2                                     # passed as '-v'
        eValue          = 0.0001                                # passed as '-e'
        nMotifs         = 100                                   # passed as '-m'

    Returns:
        None. The exit status of each dreme invocation is not inspected.
    """
    input_suffix = "_upstream_dremeready_all_simpleMasked_random.fasta"
    output_suffix = "_100bp"

    # Example of the resulting command line:
    #   dreme -png -v 1 -oc <out_dir> -p <species fasta> -e 0.05 -m 100
    for species in speciesManage.generate_list(speciesFile):
        positive_fasta = dataPath_in + species + input_suffix
        out_dir = dataPath_out + species + output_suffix

        command = ["dreme", "-" + resultFormat]
        command += ["-v", str(verbosity)]
        command += ["-oc", out_dir]  # output directory, overwritten if present
        command += ["-p", positive_fasta]  # positive sequence set
        command += ["-e", str(eValue)]
        command += ["-m", str(nMotifs)]

        subprocess.call(command)
Ejemplo n.º 4
0
def allSpecies( speciesFile, motifsPath_in, fastaPath_in, dataPath_out, verbosity, threshold ):
    """
    Notes:
        Runs the external `fimo` program once per species listed in speciesFile,
        pairing the species' DREME motif file with the species' fasta file.
        dataPath_out is created first when it does not exist yet.

    Args:
        speciesFile     = './species_list.txt'                                  # file handed to speciesManage.generate_list()
        motifsPath_in   = '../data/meme_data/out/dreme_100bp/sampled_all_hpc/'  # directory with <species>_100bp/dreme.html
        fastaPath_in    = '../data/meme_data/in/random_dreme/'                  # directory with the species fasta files
        dataPath_out    = '../data/meme_data/out/fimo/'                         # prefix of the per-species output directory
        verbosity       = 2                                                     # passed as '-verbosity'
        threshold       = 0.1                                                   # passed as '-thresh'

    Returns:
        None. The exit status of each fimo invocation is not inspected.
    """
    all_species = speciesManage.generate_list(speciesFile)

    output_root_missing = not os.path.exists(dataPath_out)
    if output_root_missing:
        os.makedirs(dataPath_out)

    # Command line shape:
    #   fimo -verbosity <v> -oc <out_dir> -thresh <t> <motif file> <sequence file>
    for species in all_species:
        print(species)

        motif_file    = motifsPath_in + species + '_100bp/dreme.html'
        sequence_file = fastaPath_in + species + '_upstream_dremeready_all_simpleMasked_random.fasta'
        out_dir       = dataPath_out + species + '_100bp'

        command = ['fimo']
        command.extend(['-verbosity', str(verbosity)])
        command.extend(['-oc', out_dir])            # output directory, overwritten if present
        command.extend(['-thresh', str(threshold)])
        command.append(motif_file)                  # motifs to scan with
        command.append(sequence_file)               # sequences to scan
        subprocess.call(command)
Ejemplo n.º 5
0
def allSpecies( speciesFile, dataPath_in, dataPath_out, verbosity, threshold ):
    """
    Notes:
        Runs the external `tomtom` program for every ordered pair (query, database) of
        species listed in speciesFile, including each species against itself, and prints
        the wall-clock duration of every run.

    Args:
        speciesFile     = './species_list.txt'                                  # file handed to speciesManage.generate_list()
        dataPath_in     = '../data/meme_data/out/dreme_100bp/sampled_all_hpc/'  # directory with <species>_100bp/dreme.html
        dataPath_out    = '../data/meme_data/out/tomtom_100bp/'                 # prefix of the per-pair output directory
        verbosity       = 2                                                     # passed as '-verbosity'
        threshold       = 0.1                                                   # passed as '-thresh'

    Returns:
        None. The exit status of each tomtom invocation is not inspected.
    """
    suffix_in       = '_100bp/dreme.html'
    suffix_out      = '_100bp'
    species_list    = speciesManage.generate_list(speciesFile)

    for speciesPair in itertools.product(species_list, species_list):
        print(speciesPair)
        species_query, species_database = speciesPair
        filename_in_query    = dataPath_in + species_query    + suffix_in
        filename_in_database = dataPath_in + species_database + suffix_in
        tomtom_out           = dataPath_out + species_query + '_vs_' + species_database + suffix_out
        # COMMAND LINE: tomtom -verbosity 2 -oc <out_dir> -thresh 0.1 <query motifs> <database motifs>
        tomtom = [  'tomtom',
                    '-verbosity',   str(verbosity),
                    '-oc',          tomtom_out,         # output directory, overwritten if present
                    '-thresh',      str(threshold),
                    filename_in_query,                  # query motif file
                    filename_in_database,               # motif file to compare against
                    ]

        # Time the run in-process. Parsing the output of an external `date` call is
        # fragile: the field position shifts when the day of month is space-padded,
        # and a run crossing midnight yields a negative duration.
        # Microseconds are dropped so the printed duration keeps the H:MM:SS form.
        start = datetime.now().replace(microsecond=0)
        subprocess.call(tomtom)
        end = datetime.now().replace(microsecond=0)
        tdelta = end - start
        print(tdelta)
Ejemplo n.º 6
0
def allSpecies( dataPath_in, bfileGeneratorPath, speciesFile='./species_list.txt', maskingChar = 'n', order=3 ):
    """
    Notes:
        Iterates meme_bfileGenerator for all species listed

    Dependencies:
        Executed:   see meme_bfileGenerator.py
        Scripts:    see meme_bfileGenerator.py

    Args:
        dataPath_in         = '0VB/2kb/data/meme_data/in/'  # directory with the allSpecies memeready fasta files;
                                                            # the generated '<species>.bg2' files are written here too
        bfileGeneratorPath                                  # forwarded unchanged to meme_bfileGenerator()
        speciesFile         = './species_list.txt'          # file handed to speciesManage.generate_list()
        maskingChar         = 'n'                           # forwarded unchanged to meme_bfileGenerator()
        order               = 3                             # forwarded unchanged to meme_bfileGenerator()

    Returns:
        None.
    """
    print('Generating bfiles for all species...')
    import speciesManage  # generate_list() builds the list of species names (EnsEMBL MySQL db names)
    species_list = speciesManage.generate_list(speciesFile)
    for species in species_list:
        print(species)
        # File names use the lower-cased species name with spaces turned into underscores.
        species_tag = species.lower().replace(' ','_')
        # A '_upstream_memeready_all_dusted.fasta' input was used previously (ANDY 25_02).
        file_in     = dataPath_in + species_tag + '_upstream_memeready_all.fasta'
        file_out    = dataPath_in + species_tag + '.bg2'

        print(file_out)

        meme_bfileGenerator( file_in, file_out, bfileGeneratorPath, maskingChar, order )
Ejemplo n.º 7
0
def allSpecies(speciesFile, dataPath_in, dataPath_out, n_seqs=10000, len_seq=100, population=True):
    """
    Notes:
        For every species, concatenates all sequences of the species' memeready FASTA
        into one meta-sequence (runs of 3 or more 'N' collapsed to a single 'N', one 'N'
        appended per sequence as delimiter), cuts windows of len_seq bases out of it and
        writes them to a new FASTA. Each window's header is the tab-joined set of gene ids
        whose sequence overlaps the window.

    Args:
        speciesFile = './species_list.txt'                  # location of species config file
        dataPath_in = '../data/meme_data/in/'               # input directory
        dataPath_out='../data/meme_data/in/random_dreme/'   # output directory (created if missing)
        n_seqs      =10000                                  # number of windows drawn when population is false
        len_seq     =100                                    # length of each window in bases
        population  =True                                   # truthy: tile the whole meta-sequence with consecutive
                                                            #         windows (the last one may be shorter)
                                                            # falsy:  draw n_seqs random non-overlapping windows

    Raises:
        ValueError: population is false and the meta-sequence is too short to hold
                    n_seqs non-overlapping windows (raised by random.sample).
    """

    def rand_starts(total, n, l):
        """ Return n sorted start offsets of non-overlapping length-l windows in [0, total). """
        # Draw n distinct positions from a range shortened by the n*(l-1) bases the windows
        # occupy beyond their first base, then spread them out again. Consecutive starts
        # end up at least l apart and the last window still fits.
        picks = sorted(random.sample(range(total - (l - 1) * n), n))
        return [pick + k * (l - 1) for k, pick in enumerate(picks)]

    sys.path.append(os.getcwd())
    suffix_in = "_upstream_memeready_all_simpleMasked.fasta"  # common filename component between all sp.
    suffix_out = suffix_in.split(".")[0].replace("memeready", "dremeready") + "_random" + "." + suffix_in.split(".")[1]
    species_list = speciesManage.generate_list(speciesFile)
    mask = re.compile(r"(N)\1{2,}")  # collapse consecutive masking chars, NNN --> N

    for species in species_list:
        print(species)
        # IMPORT FASTA                      : read as strict (header line, sequence line) pairs
        filename_in = dataPath_in + species + suffix_in
        filename_out = dataPath_out + species + suffix_out
        seqs = {}  # geneId:seq
        print("\tImporting FASTA...")
        with open(filename_in) as file_in:
            for header in file_in:
                seq = next(file_in, "")
                geneId = header.split("\t")[0].replace(">", "")
                # trailing 'N' delimits each sequence once they are concatenated
                seqs[geneId] = mask.sub("N", seq).rstrip() + "N"

        # META SEQUENCE                     : concatenate all sequences for later sampling
        print("\tconcatenating sequences to form meta-sequence...")
        gene_ids = list(seqs)
        meta_seq = "".join([seqs[geneId] for geneId in gene_ids])

        # LOCATION:GENEID                   : loc2GeneId[i] is the gene owning base i of meta_seq
        print("\tmapping meta-sequence to geneIds...")
        loc2GeneId = []
        for geneId in gene_ids:
            loc2GeneId.extend([geneId] * len(seqs[geneId]))

        # WINDOW SAMPLING                   : pick the start offset of every window
        print("\tgenerating random sub-sequence samples...")
        if population:
            starts = range(0, len(meta_seq), len_seq)
        else:
            starts = rand_starts(len(meta_seq), n_seqs, len_seq)

        randSamples = []
        for start in starts:
            stop = start + len_seq  # exclusive end: the window holds exactly len_seq bases
            randSamples.append(
                {
                    "geneId": list(set(loc2GeneId[start:stop])),  # gene ids overlapping the window
                    "seq": meta_seq[start:stop],
                }
            )

        # GENERATE FASTA
        print("\twriting sampled sequences to FASTA...")

        if not os.path.exists(dataPath_out):
            print("making directory: " + dataPath_out)
            os.makedirs(dataPath_out)

        with open(filename_out, "w") as file_out:
            for sample in randSamples:
                file_out.write(">" + "\t".join(sample["geneId"]) + "\n")
                file_out.write(sample["seq"] + "\n")
Ejemplo n.º 8
0
import speciesManage

# Collect, per species, the motif names listed in that species' DREME text output.
speciesFile         = './species_list.txt'
dataPath_in         = '../data/meme_data/out/dreme_100bp/sampled_all_hpc/'
suffix_in           = '_100bp/dreme.txt'
species_to_motifs   = {}    # species name -> list of motif strings, in file order
species_list        = speciesManage.generate_list(speciesFile)

# READ DREME => species:motifs
for species in species_list:
    print(species)
    species_to_motifs[species]  = []
    filename_in                 = dataPath_in+species+suffix_in
    count                       = 0     # number of motif lines seen for the current species
    # The context manager closes the file even if a malformed line raises below.
    with open(filename_in,'r') as file_in:
        for line in file_in:
            if 'MOTIF' in line:
                count += 1
                motif = line.split(' ')[1].rstrip()   # second space-separated field of the line
                species_to_motifs[species].append(motif)