Exemple #1
0
def rename_mfannot_proteome(fastas,
                            ids_file,
                            column_id=-1,
                            column_name=-3,
                            headermode=1):
    """Rewrite FASTA headers with names looked up in a comma-separated file.

    Every non-blank line of ``ids_file`` is split on commas.  The field at
    ``column_id`` becomes the lookup key and the field at ``column_name`` the
    replacement name (``"UNKNOWN"`` when that field is empty).  Each file in
    ``fastas`` is parsed with ``fasta.parse``, passed through
    ``fasta.set_header`` and written to ``<spec_name>_<spec_id>.fasta`` (with
    any ``/`` removed from the file name) in the current directory.

    Args:
        fastas: iterable of FASTA file paths.
        ids_file: path to the comma-separated id/name table.
        column_id: index of the id field within a row.
        column_name: index of the name field within a row.
        headermode: forwarded to ``fasta.set_header`` as ``mode``.

    Raises:
        IndexError: if a non-blank row has too few fields for the requested
            columns.
    """
    names = {}
    with open(ids_file, 'r') as idsinput:
        for line in idsinput:
            fields = line.strip().split(',')
            if fields == ['']:
                # Blank line (typically a trailing newline): the negative
                # column indices would raise IndexError on it, so skip it.
                continue
            seq_id = fields[column_id].strip()
            names[seq_id] = fields[column_name].strip() or "UNKNOWN"
    for fas in fastas:
        # update fasta headers with new names
        records, spec_name, spec_id = fasta.set_header(fasta.parse(fas),
                                                       names=names,
                                                       mode=headermode)
        new_filename = (spec_name + "_" + spec_id + ".fasta").replace("/", "")
        with open(new_filename, "w") as out:
            print("Generating", new_filename)
            for head, seq in records.items():
                out.write(">" + head + "\n" + seq + "\n")
Exemple #2
0
def split_clusters(fastas,
                   clusters,
                   ids=None,
                   filter=0.1,
                   with_treshold=False):
    """Write one ``OG<number>.fa`` file per cluster.

    ``load_clusters(clusters)`` supplies a mapping from cluster name to its
    member keys; the cluster number is the integer after the last ``_`` of
    the name, zero-padded to six digits.  Every sequence whose header
    contains a member key (substring match) is written to the cluster file.

    Args:
        fastas: list of FASTA file paths, one per species.
        clusters: passed to ``load_clusters``.
        ids: optional; when truthy it is passed to ``add_name_from_nc_id``.
        filter: fraction of species a cluster may miss when ``with_treshold``
            is set.  A cluster is kept only if its size is strictly greater
            than ``len(fastas) - len(fastas) * filter``.
        with_treshold: enable the size filter described above.
    """
    clusts = load_clusters(clusters)
    all_fastas = {}
    print("Number of species fasta files =", len(fastas),
          "\nNumber of clusters =", len(clusts))
    if ids:
        # The returned mapping is not used below; the call is kept so `ids`
        # is still read (and any error in it still surfaces) as before.
        add_name_from_nc_id(ids)
    for fas in fastas:
        all_fastas.update(fasta.parse(fas))

    # Previously hard-coded to 0.1, which silently ignored `filter`.
    min_size = len(fastas) - len(fastas) * filter
    for i, clust in enumerate(clusts):
        members = clusts[clust]
        clustname = "OG" + format(int(clust.split("_")[-1]), '06d')
        print("[", round(i / len(clusts) * 100), "% ]", "Cluster", clustname,
              "size =", len(members))
        if with_treshold and len(members) <= min_size:
            continue
        with open(clustname + '.fa', 'w') as clust_out:
            for key in members:
                # Substring match, so a plain dict lookup cannot replace it.
                for fasta_key, seq in all_fastas.items():
                    if key in fasta_key:
                        clust_out.write(">" + fasta_key + "\n" + seq + "\n")
Exemple #3
0
def detect_bad_genomes(fastas):
    """Print the species ids that carry the same gene more than once.

    Headers are split on ``_``: the first field is the gene name and the
    species id is the second field, or the third when the second field is a
    single character.  Each parsed header is echoed, and a final line prints
    the list of duplicated species, its length and the number of distinct
    species seen.
    """
    species_by_gene = {}
    for fas in fastas:
        for header in list(fasta.parse(fas).keys()):
            fields = header.split('_')
            gene_name = fields[0]
            species_id = fields[2] if len(fields[1]) == 1 else fields[1]
            print(species_id, fields)
            species_by_gene.setdefault(gene_name, []).append(species_id)

    species_list = []
    duplicates = []
    for occurrences in species_by_gene.values():
        for species in occurrences:
            if species not in species_list:
                species_list.append(species)
            # A species listed twice under one gene has a duplicated gene.
            if occurrences.count(species) > 1 and species not in duplicates:
                duplicates.append(species)
    print(duplicates, len(duplicates), len(species_list))
def main():
    """Print the consensus string and the profile matrix of the input.

    Lines are read through ``fileinput.input()`` (files named on the command
    line, or standard input), right-stripped and handed to ``fasta.parse``.
    """
    args = [line.rstrip() for line in fileinput.input()]
    fastas = fasta.parse(args)
    profile_matrix = profile.profile_matrix(fastas)
    # A parenthesised single argument prints identically as the Python 2
    # statement and as the Python 3 function.
    print(consensus_string(profile_matrix))
    profile.pretty_print(profile_matrix)
def main():
    """Build a profile matrix from the input and print its consensus.

    Input lines come from ``fileinput.input()``; trailing whitespace is
    removed before they are passed to ``fasta.parse``.  The consensus string
    is printed first, followed by ``profile.pretty_print`` of the matrix.
    """
    args = [line.rstrip() for line in fileinput.input()]
    fastas = fasta.parse(args)
    profile_matrix = profile.profile_matrix(fastas)
    # print(...) with one argument is valid on both Python 2 and Python 3;
    # the bare statement form is a SyntaxError on Python 3.
    print(consensus_string(profile_matrix))
    profile.pretty_print(profile_matrix)
def main():
    """Print the entry with the highest GC content, as a percentage.

    Input lines are read through ``fileinput.input()``, right-stripped and
    parsed with ``fasta.parse``.  ``gc_content`` must return a mapping whose
    values are comparable numbers; the output is ``"<key> <value * 100>"``.

    Raises:
        ValueError: if ``gc_content`` returns an empty mapping.
    """
    args = [line.rstrip() for line in fileinput.input()]
    fastas = fasta.parse(args)

    # max() returns the first maximal item, the same one a stable descending
    # sort would have put at index 0, without sorting everything.
    best_name, best_ratio = max(gc_content(fastas).items(),
                                key=lambda t: t[1])
    print(best_name + ' ' + str(best_ratio * 100))
Exemple #7
0
def main():
    """Report which parsed entry has the largest GC content.

    Reads lines via ``fileinput.input()``, strips trailing whitespace, parses
    them with ``fasta.parse`` and prints the key of the largest value in
    ``gc_content(fastas)`` followed by that value multiplied by 100.

    Raises:
        ValueError: if ``gc_content`` returns an empty mapping.
    """
    args = [line.rstrip() for line in fileinput.input()]
    fastas = fasta.parse(args)

    # Only the top item is needed, so take the maximum instead of sorting;
    # ties resolve to the first item, as with the stable sort used before.
    top_name, top_ratio = max(gc_content(fastas).items(), key=lambda t: t[1])
    # Single-argument print(...) runs on both Python 2 and Python 3.
    print(top_name + ' ' + str(top_ratio * 100))
Exemple #8
0
	def get_input(self):
		'''Open a single .seq, .fasta, .fastq, .scf or .ab1 file and set variables accordingly.

		Reads self.filepath. The bare file name is stored in self.name and the
		upper-cased extension (None when there is no extension) in
		self.input_type. The last two characters of the name before the
		extension select the orientation: "fw" or "rv", case-insensitive.

		.txt, .seq and extension-less files are read as plain text with the
		newlines removed; .ab1/.abi files are read through ABIreader.Trace;
		.fasta and .fastq files go through fasta.parse / fastq.parse. .abif,
		.ztr and .scf are recognised but only print a "not yet implemented"
		message.

		Raises:
			TypeError: if the name does not end in fw/rv, or the extension is
				not one of the recognised ones.
		'''
		filename = self.filepath.split('/')[-1] #drop the directory part
		self.name = filename

		#establish type of input file; stem is the name without its extension
		if '.' in filename:
			stem, extension = filename.rsplit('.', 1)
			self.input_type = extension.upper()
		else:
			stem = filename
			self.input_type = None

		#establish orientation of DNA from the end of the stem; looking at the
		#end of the full name would compare against the extension instead
		orientation = stem[-2:].upper()
		if orientation == 'FW':
			self.SetOrientation('fw')
		elif orientation == 'RV':
			self.SetOrientation('rv')
		else:
			raise TypeError('The last two characters of the filename (before the .) must specify whether the sequence is fw or rv. Pleace rename file %s accordingly' % filename)

		#read the input
		if self.input_type in ['TXT', 'SEQ', None]:
			with open(self.filepath, 'r') as f:
				dna = f.read()
			#add an assert that there are only dna bases here
			self.SetDNA(dna.replace('\n', ''))

		elif self.input_type in ['AB1', 'ABI']:
			ab1 = ABIreader.Trace(self.filepath, trimming=False) #optionally ', trimming=True'
			self.SetDNA(ab1.seq)
			self.SetQualVal(ab1.qual_val)
			#NOTE(review): these were read from an undefined name (AB1Trace);
			#assumed to mean the trace loaded just above - confirm
			self.SetTrace([ab1.data['raw1'], ab1.data['raw2'], ab1.data['raw3'], ab1.data['raw4']])

		elif self.input_type == 'ABIF':
			print('Support for .abif files has not yet been implemented')

		elif self.input_type == 'ZTR':
			print('Support for .ztr files has not yet been implemented')

		elif self.input_type == 'SCF':
			print('Support for .scf files has not yet been implemented')

		elif fnmatch.fnmatch(filename, '*.fasta'):
			_, dna = fasta.parse(self.filepath) #parse the fasta file. File should contain ONE entry
			self.SetDNA(dna)

		elif fnmatch.fnmatch(filename, '*.fastq'):
			_, dna, _, qual_val = fastq.parse(self.filepath) #parse the fastq file. File should contain ONE entry
			self.SetDNA(dna)
			self.SetQualVal(qual_val)

		else:
			raise TypeError('"%s" is not a .txt, .seq, .scf, .fasta, .fastq, .abif, .abi or .ztr file' % filename)
def main():
    """Print the longest element shared by every entry's ``permutation_set``.

    Lines are read through ``fileinput.input()``, right-stripped and parsed
    with ``fasta.parse``; the ``content`` of each parsed entry is passed to
    ``permutation_set`` and the results are intersected.

    Raises:
        IndexError: if no entries were parsed.
        ValueError: if the intersection is empty.
    """
    args = [line.rstrip() for line in fileinput.input()]
    fastas = fasta.parse(args)
    strings = [f.content for f in fastas]
    intersect_set = permutation_set(strings[0])
    for s in strings[1:]:
        # Rebind rather than update in place so the set returned by
        # permutation_set is never mutated.
        intersect_set = intersect_set & permutation_set(s)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(max(intersect_set, key=len))
Exemple #10
0
def main():
    """Intersect ``permutation_set`` over all entries and print the longest hit.

    Input lines come from ``fileinput.input()`` with trailing whitespace
    removed; ``fasta.parse`` turns them into entries whose ``content``
    attribute is used.

    Raises:
        IndexError: if no entries were parsed.
        ValueError: if the entries share no element.
    """
    args = [line.rstrip() for line in fileinput.input()]
    fastas = fasta.parse(args)
    strings = [f.content for f in fastas]
    first, rest = strings[0], strings[1:]
    intersect_set = permutation_set(first)
    for s in rest:
        intersect_set = intersect_set & permutation_set(s)
    # The statement form `print x` is a SyntaxError on Python 3; this form
    # behaves the same on Python 2.
    print(max(intersect_set, key=len))
Exemple #11
0
def list_common_genes(fastas, gene_pos=0, id_position=-1, delimiter="~"):
    """Keep the species of the smallest gene set and write per-gene and
    per-species FASTA files.

    Headers are split on ``delimiter``; field ``gene_pos`` is the gene name
    and field ``id_position`` the species id.  The gene with the fewest
    headers (the last such gene on ties) defines the set of species ids to
    keep.  For every gene, the headers of kept species are written to
    ``<gene>.fasta``.  For every kept species, the sequences of its kept
    headers are concatenated in gene order and written, wrapped at 70
    characters, to ``<last header field>.concat.fasta`` under a header made
    of the last two header fields.

    Args:
        fastas: iterable of FASTA file paths.
        gene_pos: index of the gene name within a split header.
        id_position: index of the species id within a split header.
        delimiter: header field separator.
    """
    genes = {}
    all_fastas = {}
    for fas in fastas:
        fasta_stream = fasta.parse(fas)
        all_fastas.update(fasta_stream)
        for head in fasta_stream:
            genes.setdefault(head.split(delimiter)[gene_pos], []).append(head)
    if not genes:
        return

    # Computed once; it used to be recomputed for every gene.
    smallest_size = min(len(heads) for heads in genes.values())
    for heads in genes.values():
        if len(heads) == smallest_size:
            # No break: the last gene of minimal size wins, as before.
            smallest_subset = {
                head.split(delimiter)[id_position] for head in heads
            }

    kept_genes = {}
    heads_by_species = {}
    for gene, heads in genes.items():
        for head in heads:
            species_id = head.split(delimiter)[id_position]
            if species_id in smallest_subset:
                kept_genes.setdefault(gene, []).append(head)
                heads_by_species.setdefault(species_id, []).append(head)

    for gene, heads in kept_genes.items():
        with open(gene + ".fasta", "w") as output:
            for header in heads:
                output.write(">" + header + "\n" + all_fastas[header] + "\n")

    # concat
    max_len = 70
    for heads in heads_by_species.values():
        # One concatenation per species.  The accumulator used to be shared
        # by all headers and never reset, so each file also contained the
        # sequences of every species processed before it.
        concat = "".join(all_fastas[head] for head in heads)
        last_fields = heads[-1].split(delimiter)
        with open(last_fields[-1] + ".concat.fasta", "w") as output:
            output.write(">" + delimiter.join(last_fields[-2:]) + "\n")
            for start in range(0, len(concat), max_len):
                output.write(concat[start:start + max_len] + "\n")
Exemple #12
0
def remove_spurious_clusters(fastas, clusters, ids=None):
    """Copy to ``<fas>.fasta`` only the records found in the largest cluster.

    The largest cluster is the entry of ``load_clusters(clusters)`` with the
    most distinct members (the first such entry on ties).  For every header
    of every input file, the second- and third-to-last ``_``-separated
    fields are printed; the record is written only when the full header is a
    member of that cluster.
    """
    clusts = load_clusters(clusters)
    largest_members = max(clusts.items(),
                          key=lambda item: len(set(item[1])))[1]
    if ids:
        # The result is unused here; the call itself is kept as-is.
        add_name_from_nc_id(ids)
    for fas in fastas:
        records = fasta.parse(fas)
        with open(fas + '.fasta', 'w') as fas_out:
            for header in records:
                print('_'.join(header.split('_')[-3:-1]))
                if header not in largest_members:
                    continue
                fas_out.write(">" + header + '\n')
                fas_out.write(records[header] + '\n')
Exemple #13
0
def ortho_intersect(fastas):
    """Write the headers each file shares with the files listed before it.

    Headers that do not already consist of three ``~``-separated fields are
    normalised to ``<f0>~<f1 with '-' replaced by '_'>~<remaining fields
    joined by '_'>``, where the fields come from splitting on ``_``.  For
    file number ``k`` the headers also present in each earlier file are
    collected (once per earlier file that contains them).  The 21 largest
    collections are written to ``GROUP_0`` ... ``GROUP_20`` in FASTA format.

    Args:
        fastas: iterable of FASTA file paths.
    """
    group = {}
    all_fastas = {}
    # parse fastas and build all groups
    for k, fas in enumerate(fastas):
        group[k] = []
        for key, seq in fasta.parse(fas).items():
            if len(key.split("~")) != 3:
                parts = key.split("_")
                key = '~'.join([
                    parts[0], parts[1].replace('-', '_'),
                    '_'.join(parts[2:])
                ])
            group[k].append(key)
            # Index sequences by the normalised header: the groups hold
            # normalised headers, so looking them up under the raw header
            # raised KeyError for every rewritten header.
            all_fastas[key] = seq

    # Sets make the membership tests below O(1) instead of a list scan.
    members = {k: set(heads) for k, heads in group.items()}
    new_group = {}
    for grp_a in group:
        new_group[grp_a] = []
        for grp_b in group:
            if grp_b == grp_a:
                # Only compare against the files that came before grp_a.
                break
            new_group[grp_a].extend(head for head in group[grp_a]
                                    if head in members[grp_b])
        print(round(grp_a / len(group) * 100), "%")
    print("-----COMP-----")

    ranked = sorted(new_group, key=lambda k: len(new_group[k]), reverse=True)
    for i, k in enumerate(ranked[:21]):
        print(k, len(new_group[k]))
        with open("GROUP_" + str(i), "w") as output:
            for entry in new_group[k]:
                output.write(">" + entry + "\n" + all_fastas[entry] + "\n")
Exemple #14
0
def subset_seq_from_tax(keyword, taxonomy, fastas):
    """
    Keep only the sequences whose id matches a taxonomy keyword.

    input:
    -----
    keyword (str or iterable of str): e.g. "sordariales"; several keywords
        can be given in one string separated by ";"
    taxonomy (str): path to a CSV file whose last column is the sequence id
    fastas (list): a list of fasta files to analyse

    Called using -selectgroup flag :
    mtanalysis -selectgroup "sordariales" taxonomy.csv cox1.fasta

    Every taxonomy line containing one of the keywords (case-insensitive)
    contributes its last comma-separated field to the list of retained ids.
    For each fasta file, the records whose last "~"-separated header field
    is a retained id are written to "<fasta>.ret".
    """
    if isinstance(keyword, str):
        keywds = keyword.split(";")
        if len(keywds) > 1:
            print(keywds)
    else:
        # Already a collection of keywords.
        keywds = list(keyword)
    lowered = [kw.lower() for kw in keywds]

    taxonomy_retain = []
    with open(taxonomy, 'r') as taxo_f:
        for line in taxo_f:
            line_lower = line.lower()
            for kw in lowered:
                if kw in line_lower:
                    taxonomy_retain.append(line.split(",")[-1].strip())
    print(taxonomy_retain)

    retained_ids = set(taxonomy_retain)
    for fas in fastas:
        fasta_stream = fasta.parse(fas)
        with open(fas + ".ret", "w") as output:
            for key, seq in fasta_stream.items():
                if key.split("~")[-1] in retained_ids:
                    output.write(">" + key + "\n" + seq + "\n")
Exemple #15
0
def rename_from_id(ids_file,
                   fastas,
                   delimiter="~",
                   id_indice=-1,
                   species_name_pos=-3):
    """Replace the second header field of each record by a name from a CSV.

    Every non-blank line of ``ids_file`` is split on commas; the field at
    ``id_indice`` is the id and the field at ``species_name_pos`` the name.
    Each FASTA file is rewritten in place: headers are split on
    ``delimiter``, the id is read from field ``id_indice`` and field 1 is
    replaced by the matching name with spaces turned into underscores.

    Args:
        ids_file: path to the comma-separated id/name table.
        fastas: iterable of FASTA file paths; each one is overwritten.
        delimiter: header field separator.
        id_indice: index of the id, both in a CSV row and in a header.
        species_name_pos: index of the name within a CSV row.

    Raises:
        KeyError: if a header id is missing from ``ids_file``.  The file
            being processed is left untouched in that case.
    """
    names = {}
    with open(ids_file, "r") as idsinput:
        for line in idsinput:
            line = line.strip()
            if not line:
                # A blank line has no columns to index.
                continue
            fields = line.split(",")
            names[fields[id_indice].strip()] = fields[species_name_pos].strip()
    for fas in fastas:
        fasta_stream = fasta.parse(fas)
        # Resolve every header before reopening `fas` for writing: opening
        # it first truncated the file, so a missing id lost the input.
        records = []
        for key, seq in fasta_stream.items():
            head_lst = key.split(delimiter)
            head_lst[1] = names[head_lst[id_indice]].replace(" ", "_")
            records.append(">" + delimiter.join(head_lst) + "\n" + seq + "\n")
        with open(fas, "w") as output:
            output.writelines(records)
Exemple #16
0
    parser.add_argument('Reads', type=str, nargs='?', default='-',
                        help='Reference file or - for stdin')
    parser.add_argument('-o', dest='output', action='store', default='-',
                        help='Output fasta or - for stdout')
    parser.add_argument('--rev', action='store_true',
                        help='Consider reverse strip')
    args = parser.parse_args()

    if args.Reads == '-':
        f = sys.stdin
    else:
        f = open(args.Reads, 'rb')

    if args.output == '-':
        g = sys.stdout
    else:
        g = open(args.output, 'wb')

    F = [s for _, s in parse(f)]
    if args.rev:
        # TODO: Parse reverse strip fragments
        pass

    for edge in multi_graph(F):
        g.write("%s\n" % " ".join([str(i) for i in edge]))

    if args.Reads != '-':
        f.close()
    if args.output != '-':
        g.close()
Exemple #17
0
# http://rosalind.info/problems/grph/

import fasta


def isConnected(s, t, k):
    """
    Returns true if string s has a suffix of length k equal
    to the prefix of t of length k.

    Returns false when either string is shorter than k, since it then has
    no suffix/prefix of that length. For k == 0 the empty suffix always
    equals the empty prefix, so the result is true.

    Raises ValueError if k is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    if k > len(s) or k > len(t):
        return False
    # s[len(s) - k:] instead of s[-k:]: the latter is the whole of s when
    # k == 0, which made the comparison fail for any non-empty s.
    return s[len(s) - k:] == t[:k]


entries = list(fasta.parse())
# Ordered pairs of differing entries where the last 3 characters of the
# first sequence equal the first 3 characters of the second.
edges = ((a, b) for a in entries for b in entries
         if a != b and isConnected(a[1], b[1], 3))

for v1, v2 in edges:
    # Formatted into one string so the output is the same on Python 2 and
    # Python 3 (print(a, b) would print a tuple on Python 2).
    print("%s %s" % (v1[0], v2[0]))
Exemple #18
0
def build_dna_table():
    """Load a lookup table from ``table.txt`` in the working directory.

    The file is read as whitespace-separated tokens taken in pairs: the
    first token of each pair is the key and the second its value.

    Returns:
        dict mapping every first token to the token that follows it.
    """
    # The context manager closes the handle, which used to be left open.
    with open("table.txt") as table_file:
        tokens = table_file.read().split()
    return dict(zip(tokens[0::2], tokens[1::2]))

def dna_to_proteins(dna, dna_table):
    """Yield the protein string of every open reading frame in ``dna``.

    ``dna`` is cut into consecutive codons starting at offset 0 (an
    incomplete trailing codon is dropped).  Each ``ATG`` codon starts a
    protein beginning with ``"M"``; the codons after it are translated with
    ``dna_table`` until ``TAA``, ``TAG`` or ``TGA`` is met, at which point
    the protein is yielded.  A start codon with no later stop codon yields
    nothing.

    Args:
        dna: the sequence to scan, read in frame 0.
        dna_table: mapping from codon to amino-acid letter.

    Raises:
        KeyError: if a codon inside a reading frame is not in ``dna_table``.
    """
    # range works on both Python 2 and 3; xrange exists only on Python 2.
    codons = [dna[i:i + 3] for i in range(0, len(dna) - 2, 3)]
    stop_codons = ("TAA", "TAG", "TGA")

    for i, codon in enumerate(codons):
        if codon != "ATG":
            continue
        protein = ["M"]
        for codon2 in codons[i + 1:]:
            if codon2 in stop_codons:
                yield ''.join(protein)
                break
            protein.append(dna_table[codon2])

dna_table = build_dna_table()
dna = next(fasta.parse())[1]
# Reverse complement: read the strand backwards and swap A<->T, C<->G.
# Other characters are kept as they are.  A plain dict avoids
# string.maketrans, which only exists on Python 2.
base_pairs = {"A": "T", "T": "A", "C": "G", "G": "C"}
complement = "".join(base_pairs.get(base, base) for base in reversed(dna))
# Three frames of the strand followed by three frames of its reverse
# complement.
reading_frames = [strand[i:] for strand in (dna, complement) for i in range(3)]

proteins = set()
for reading_frame in reading_frames:
    proteins.update(protein
                    for protein in dna_to_proteins(reading_frame, dna_table)
                    if protein)

print("\n".join(proteins))
Exemple #19
0
# http://rosalind.info/problems/lcsm/

import fasta, collections

strings = [s for desc, s in fasta.parse()]
subs = strings[0]
found = []

# Grow a substring of the first string from every start offset and stop
# growing as soon as one of the strings no longer contains it.
for i in range(len(subs)):
    # Position of the previous match in each string for this start offset.
    # A longer substring cannot first occur before its own prefix did, so
    # the next search may resume from there.
    indexes = collections.defaultdict(int)

    for j in range(1, len(subs) - i + 1):
        substr = subs[i:i + j]
        common = True

        for s in strings:
            index = s.find(substr, indexes[s])
            if index == -1:
                common = False
                break
            indexes[s] = index

        if not common:
            break
        found.append(substr)

# max() returns the first of the longest substrings, and raises ValueError
# when nothing is shared.
print(max(found, key=len))
Exemple #20
0
# http://rosalind.info/problems/gc/

import fasta


def gcContent(s):
    """Return the fraction (0.0 to 1.0) of items in ``s`` equal to 'G' or 'C'.

    The comparison is case-sensitive.  An empty sequence yields 0.0 instead
    of raising ZeroDivisionError.
    """
    total = len(s)
    if total == 0:
        return 0.0
    gc_count = sum(1 for c in s if c in ('G', 'C'))
    return float(gc_count) / total


entries = fasta.parse()
# Lazily pair each description with the GC fraction of its sequence.
contents = ((d, gcContent(s)) for d, s in entries)
# max() keeps the first entry among equal fractions and raises ValueError
# when there are no entries.
desc, maxGcContent = max(contents, key=lambda x: x[1])

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(desc)
print(maxGcContent * 100)
Exemple #21
0
                        type=int, help='Maximum read length')
    parser.add_argument('--coverage', type=float, action='store', default=11,
                        help='Desired coverage of given RefSeq')
    parser.add_argument('--error', type=float, action='store', default=0,
                        help='Error rate, in %')
    parser.add_argument('--rev', type=float, action='store', default=0,
                        help='Reverse strip rate, in %')
    args = parser.parse_args()

    if args.RefSeq == '-':
        f = sys.stdin
    else:
        f = open(args.RefSeq, 'rb')

    if args.output == '-':
        g = sys.stdout
    else:
        g = open(args.output, 'wb')

    seqs = [s for _, s in parse(f)]

    for i, fragment in enumerate(split_reads(seqs, args.min_size,
                                             args.max_size, args.coverage,
                                             args.error, args.rev)):
        write(fragment, 'fragment_%d' % i, g)

    if args.RefSeq != '-':
        f.close()
    if args.output != '-':
        g.close()
Exemple #22
0
# http://rosalind.info/problems/lcsm/

import fasta, collections

strings = [s for desc, s in fasta.parse()]
subs, longest = strings[0], ""

# Grow a substring of the first string from every start offset for as long
# as all strings contain it.
for i in range(len(subs)):
    for j in range(1, len(subs) - i + 1):
        substr = subs[i:i + j]
        if not all(substr in s for s in strings):
            break
        # >= rather than >: on equal length the later substring replaces
        # the earlier one, as max(substr, longest, key=len) did.
        if len(substr) >= len(longest):
            longest = substr

print(longest)
Exemple #23
0
# http://rosalind.info/problems/gc/

import fasta

def gcContent(s):
    """Return the share of 'G' and 'C' items in ``s`` as a float in [0, 1].

    Matching is case-sensitive.  For an empty ``s`` the result is 0.0; the
    division used to raise ZeroDivisionError in that case.
    """
    length = len(s)
    if length == 0:
        return 0.0
    return float(sum(1 for c in s if c in ('G', 'C'))) / length

entries = fasta.parse()
# Generator of (description, GC fraction) pairs, consumed once by max().
contents = ((d, gcContent(s)) for d, s in entries)
# The first entry wins on ties; no entries at all raises ValueError.
desc, maxGcContent = max(contents, key=lambda x: x[1])

# The `print x` statement form is a SyntaxError on Python 3; the
# parenthesised form prints the same on Python 2.
print(desc)
print(maxGcContent * 100)
Exemple #24
0
# http://rosalind.info/problems/grph/

import fasta

def isConnected(s, t, k):
    """
    Returns true if string s has a suffix of length k equal
    to the prefix of t of length k.

    A string shorter than k has no suffix/prefix of length k, so the
    result is then false. With k == 0 both are empty and the result is
    true.

    Raises ValueError if k is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    if len(s) < k or len(t) < k:
        return False
    # Slice from an explicit start index: s[-k:] is the whole string when
    # k == 0, not the empty suffix.
    suffix_start = len(s) - k
    return s[suffix_start:] == t[:k]

entries = list(fasta.parse())
# Every ordered pair of differing entries whose sequences overlap by 3
# characters (end of the first, start of the second).
edges = ((a, b) for a in entries
                  for b in entries
                    if a != b and isConnected(a[1], b[1], 3))

for v1, v2 in edges:
    # One pre-formatted string keeps the output identical on Python 2 and
    # Python 3; print(a, b) would print a tuple on Python 2.
    print("%s %s" % (v1[0], v2[0]))