def main(dna_file, protein_file=None, output_h=sys.stdout):
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")
    work_dir = op.join(os.getcwd(), "syn_analysis")
    if not op.exists(work_dir):
        os.makedirs(work_dir)

    if not protein_file:
        protein_file = translate_dna(dna_file)

    prot_iterator = SeqIO.parse(protein_file, "fasta")
    dna_iterator = SeqIO.parse(dna_file, "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

        print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
        align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2, work_dir)
        if mrtrans_fasta:
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write(
                    "%s\n" % (",".join(str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng)))
                )
                output_h.flush()

    # Cleanup
    for f in ("2YN.dN", "2YN.dS", "2YN.t", "rst", "rst1", "rub"):
        if op.exists(f):
            os.remove(f)
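The zip(prot_iterator, prot_iterator, ...) call above pairs consecutive records by zipping each iterator with itself. A minimal sketch of the idiom (the records list is hypothetical):

records = iter(["seqA_1", "seqA_2", "seqB_1", "seqB_2"])
# zip() advances the same iterator once per argument, so each tuple holds
# two consecutive items: ("seqA_1", "seqA_2"), then ("seqB_1", "seqB_2")
for first, second in zip(records, records):
    print(first, second)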
Example #2
    def standard_test_procedure(self, cline):
        """Standard testing procedure used by all tests."""

        # Overwrite existing files.
        cline.force = True

        # Mark output files for later cleanup.
        self.add_file_to_clean(cline.outfile)
        if cline.guidetree_out:
            self.add_file_to_clean(cline.guidetree_out)

        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
        self.assertEqual(str(eval(repr(cline))), str(cline))
        output, error = cline()
        self.assertTrue(not output or output.strip().startswith("CLUSTAL"))

        # Test if ClustalOmega executed successfully.
        self.assertTrue(error.strip() == "" or
               error.startswith("WARNING: Sequence type is DNA.") or
               error.startswith("WARNING: DNA alignment is still experimental."))

        # Check the output...
        align = AlignIO.read(cline.outfile, "clustal")
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
        self.assertEqual(len(set(input_records.keys())), len(set(output_records.keys())))
        for record in align:
            self.assertEqual(str(record.seq), str(output_records[record.id].seq))

        # TODO - Try and parse this with Bio.Nexus?
        if cline.guidetree_out:
            self.assertTrue(os.path.isfile(cline.guidetree_out))
Example #3
def load_examples_from_fasta(signal, org, data_path):
    """
    load examples from fasta file

    signal 
    """

    fn_pos = "%s/%s_sig_%s_example.fa" % (data_path, signal, "pos")
    fn_neg = "%s/%s_sig_%s_example.fa" % (data_path, signal, "neg")
    print "loading: \n %s \n %s" % (fn_pos, fn_neg)

    # parse file
    xt_pos = [str(rec.seq) for rec in SeqIO.parse(fn_pos, "fasta")]
    xt_neg = [str(rec.seq) for rec in SeqIO.parse(fn_neg, "fasta")]

    labels = [+1] * len(xt_pos) + [-1] * len(xt_neg)
    examples = xt_pos + xt_neg

    print (
        "organism: %s, signal %s,\t num_labels: %i,\t num_examples %i,\t num_positives: %i,\t num_negatives: %i"
        % (org, signal, len(labels), len(examples), len(xt_pos), len(xt_neg))
    )

    examples_shuffled, labels_shuffled = helper.coshuffle(examples, labels)
    ret = {"examples": numpy.array(examples_shuffled), "labels": numpy.array(labels_shuffled)}

    return ret
Example #4
def main(args):
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database, user=args.user, host=args.host, passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)

    db = server[args.database_name]

    try:
        if args.gff is not None and args.fasta is not None:
            load_gff(db, args.gff, args.fasta, args.tax_lookup, args.taxid)
        elif args.genbank is not None:
            load_genbank(db, args.genbank, args.tax_lookup, args.taxid)
    except:
        server.adaptor.rollback()
        raise

    if args.new_taxons:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)

        if args.fasta is not None:
            gen = SeqIO.parse(args.fasta, 'fasta')
        elif args.genbank is not None:
            gen = SeqIO.parse(args.genbank, 'genbank')

        for rec in gen:
            server.adaptor.execute('update bioentry set taxon_id = %s where bioentry_id = %s',(taxon_id, db.adaptor.fetch_seqid_by_display_id(db.dbid, rec.name)))

    server.commit()
Example #5
 def loop(self, filename, format):
     original_records = list(SeqIO.parse(open(filename), format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                           user = DBUSER, passwd = DBPASSWD,
                                           host = DBHOST, db = TESTDB)
     db_name = "test_loop_%s" % filename  # new namespace!
     db = server.new_database(db_name)
     count = db.load(original_records)
     self.assertEqual(count, len(original_records))
     server.commit()
     #Now read them back...
     biosql_records = [db.lookup(name=rec.name)
                       for rec in original_records]
     #And check they agree
     self.assertTrue(compare_records(original_records, biosql_records))
     #Now write to a handle...
     handle = StringIO()
     SeqIO.write(biosql_records, handle, "gb")
     #Now read them back...
     handle.seek(0)
     new_records = list(SeqIO.parse(handle, "gb"))
     #And check they still agree
     self.assertEqual(len(new_records), len(original_records))
     for old, new in zip(original_records, new_records):
          #TODO - remove this hack once these annotations can be written:
         for key in ["comment", "references", "db_source"]:
             if key in old.annotations and key not in new.annotations:
                 del old.annotations[key]
         self.assertTrue(compare_record(old, new))
     #Done
     server.close()
Example #6
 def setUp(self):
     self.aln_file = [TEST_ALIGN_FILE1,
                      TEST_ALIGN_FILE2,
                      TEST_ALIGN_FILE3,
                      TEST_ALIGN_FILE4,
                      TEST_ALIGN_FILE5,
                      TEST_ALIGN_FILE6]
     alns = []
     for i in self.aln_file:
         if i[1] == 'parse':
             nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet)
         elif i[1] == 'index':
             nucl = SeqIO.index(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet, max_score=20)
         elif i[1] == 'id':
             nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
             with open(i[0][2]) as handle:
                 id = dict((i.split()[0], i.split()[1]) for i in handle)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, corr_dict=id, alphabet=codonalign.default_codon_alphabet)
         alns.append(caln)
          if i[1] == 'index':
              nucl.close()  # Close the indexed FASTA file
     self.alns = alns
Example #7
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    @param fname_aln:
        MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc:
        Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot:
        Protein sequences in FASTA format (.fasta)
    @return:
        Codon alignment in CLUSTAL format (.aln), suitable for codeml
    1"""
    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln, which
    # was reordered due to CLUSTALW2.  Note that the first protein in each of
    # these files remains the same as at the start, however; this first protein
    # is our original query protein.
    nuc_records = [record for record in SeqIO.parse(fname_nuc, "fasta")]
    prot_records = [record for record in SeqIO.parse(fname_prot, "fasta")]
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))
    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")
    fname_codon = "homologs.codon.aln"
    # TODO: use subprocess
    os.system("%s/pal2nal.pl %s %s -output paml > %s" % (bin_dir(), fname_aln, fname_nuc2, fname_codon))
    return fname_codon
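The TODO above asks for a subprocess-based call; a minimal sketch of that replacement, assuming the same bin_dir() helper and the file names from the snippet:

import subprocess

def run_pal2nal_cmd(fname_aln, fname_nuc2, fname_codon):
    # Same pal2nal.pl invocation as the os.system() line above, without a shell.
    cmd = ["%s/pal2nal.pl" % bin_dir(), fname_aln, fname_nuc2, "-output", "paml"]
    with open(fname_codon, "w") as out:
        subprocess.run(cmd, stdout=out, check=True)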
Example #8
File: z4.py Project: strop/WBO_11
def main_build_markov(promotor_filename = "promotor.fa", genome_filename = "genom.fa", symbol_length = 2, load_cached = False, save_cache = True):
  ''' Builds a Markov model from the files with the promoter sequences and the genome. '''
  promotor_sequences = [x for x in SeqIO.parse(promotor_filename, "fasta")]
  genome = [x for x in SeqIO.parse(genome_filename, "fasta")]
  if not load_cached:
    promotor_freqs = calc_symbol_freq(promotor_sequences)
    genome_freqs = calc_symbol_freq(genome)
    if save_cache:
      dump_obj(promotor_freqs, Dumpfiles.promotor_freq)
      dump_obj(genome_freqs, Dumpfiles.genome_freq)
  else:
    promotor_freqs = load_obj(Dumpfiles.promotor_freq)
    genome_freqs = load_obj(Dumpfiles.genome_freq)
  
  promotor_counts = calc_counts(promotor_sequences)
  genome_counts = calc_counts(genome)

  print(promotor_counts)

  promotor_freqs = fold_and_normalize(promotor_freqs[symbol_length], symbol_length, promotor_counts[symbol_length])
  genome_freqs = fold_and_normalize(genome_freqs[symbol_length], symbol_length, genome_counts[symbol_length])

  for k in promotor_freqs:
    assert(k in genome_freqs)
  for k in genome_freqs:
    assert(k in promotor_freqs)

  print(promotor_freqs)
  (markov, states) = build_markov(genome_freqs, promotor_freqs)

  return (markov, states)
Example #9
def cluster_pid(folder):
    result = []
    f_name = folder.split("/")[-1]
    try:
        genes = pd.read_csv(folder + "/report/" + f_name + "_genes.csv")
        genes = genes.loc[~(genes['cluster'].isin(['na', '0', 0])) & (genes['species'] == 'Homo sapiens')]
        if genes.shape[0] > 0:
            for cluster in set(genes['cluster']):
                pids = []
                accs = genes.loc[genes['cluster'] == cluster, 'prot_acc'].values
                for seq1 in accs:
                    for seq2 in accs:
                        seq_1 = [x.seq for x in SeqIO.parse("../cgpf_ncbi/all_seqs.fa", 'fasta') if
                                 x.name.split("|")[2] == seq1]
                        seq_2 = [x.seq for x in SeqIO.parse("../cgpf_ncbi/all_seqs.fa", 'fasta') if
                                 x.name.split("|")[2] == seq2]
                        aln = pairwise2.align.globalxx(seq_1[0], seq_2[0])[0]
                        mean_len = (len(aln[0]) + len(aln[1])) / 2
                        pids.append(aln[2] / mean_len)

                n_genes = len(pids)
                mean_pid = np.mean(pids)
                sd_id = np.std(pids)
                result.append((cluster, n_genes, mean_pid, sd_id))
                print(cluster)
        return result
    except OSError:
        return None
Example #10
def count_overlap(filename):
    for seq_record in SeqIO.parse(filename, "fasta"):
        for seq_record_1 in SeqIO.parse(filename, "fasta"):
            s1 = seq_record.seq
            s2 = seq_record_1.seq
            if s1 != s2 and s1[-3:] == s2[0:3]:
                print(seq_record.id + " " + seq_record_1.id)
Example #11
 def _validate_fasta(self, text):
     try:
          next(SeqIO.parse(text, 'fasta'))
         return text
     except StopIteration:
         raise argparse.ArgumentTypeError(
             "{0} is not fasta file".format(text))
Example #12
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    Remove read pairs from a pair of fastq files when either end is
    shorter than min_length, preserving the order of the reads; ends
    whose mate fails the filter are written to singles files.

    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, open(fq2_out, 'w') as fq2_out_handle, open(fq1_single, 'w') as fq1_single_handle, open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            else:
                if len(fq1_record.seq) > min_length:
                    fq1_single_handle.write(fq1_record.format(quality_format))
                if len(fq2_record.seq) > min_length:
                    fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
Example #13
def main(args):
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database, user=args.user, host=args.host, passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)

    db = server[args.database_name]



    gen = []
    if args.fasta is not None:
        for rec in SeqIO.parse(args.fasta, 'fasta'):
            gen.append(rec.name)
    elif args.genbank is not None:
        for rec in SeqIO.parse(args.genbank, 'genbank'):
            gen.append(rec.name)
    elif args.input is not None:
        with open(args.input) as fp:
            for line in fp:
                gen.append(line.rstrip())

    if args.remove:
        taxon_id = None
    else:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)

    for rec in gen:
        server.adaptor.execute('update bioentry set taxon_id = %s where bioentry_id = %s',(taxon_id, db.adaptor.fetch_seqid_by_display_id(db.dbid, rec)))
    server.commit()
Example #14
    def _set_seqFormat(self, inFile, nlines=100):
        """Determining the format of the seuqence file.

        Args:
        inFile -- file name
        nlines -- number of lines in file to check (starting from top)

        Attrib set:
        readFileFormat -- set to fasta or fastq or NoneType
        """
        with open(inFile, 'r') as fd:
            head = ''.join([fd.readline() for x in range(nlines)])

        # format?
        nseqs_fasta = len( [seq_rec.id for seq_rec in SeqIO.parse(StringIO(head), 'fasta')] )
        nseqs_fastq = len( [seq_rec.id for seq_rec in SeqIO.parse(StringIO(head), 'fastq')] )
        
        if nseqs_fasta > 0 and nseqs_fastq > 0:
            if nseqs_fasta > nseqs_fastq:
                self.set_readFileFormat('fasta')
            elif nseqs_fasta < nseqs_fastq:
                self.set_readFileFormat('fastq')
            else:
                raise IOError('  The file appears to be both fasta and fastq\n')                
        elif nseqs_fasta > 0:
            self.set_readFileFormat('fasta')
        elif nseqs_fastq > 0:
            self.set_readFileFormat('fastq')
        else:
            self.set_readFileFormat(None)
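The same sniffing idea as a standalone helper; a sketch assuming the sampled head fits in memory. A fastq record cut short by the sampling can raise ValueError, which is treated as "not fastq" here:

from io import StringIO
from Bio import SeqIO

def sniff_seq_format(in_file, nlines=100):
    """Return 'fasta', 'fastq', or None based on the first nlines of in_file."""
    with open(in_file) as fd:
        head = ''.join(fd.readline() for _ in range(nlines))
    n_fasta = sum(1 for _ in SeqIO.parse(StringIO(head), 'fasta'))
    try:
        n_fastq = sum(1 for _ in SeqIO.parse(StringIO(head), 'fastq'))
    except ValueError:  # the sample cut the last fastq record short
        n_fastq = 0
    if n_fasta and n_fastq:
        raise IOError('The file appears to be both fasta and fastq')
    if n_fasta:
        return 'fasta'
    return 'fastq' if n_fastq else None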
Example #15
    def standard_test_procedure(self, cline):
        """Standard testing procedure used by all tests."""
        self.assertTrue(str(eval(repr(cline))) == str(cline))
        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"),
                                      lambda rec : rec.id.replace(":", "_"))

        #Determine name of tree file
        if cline.newtree:
            tree_file = cline.newtree
        else:
            #Clustalw will name it based on the input file
            tree_file = os.path.splitext(cline.infile)[0] + ".dnd"

        # Mark generated files for later removal
        self.add_file_to_clean(cline.outfile)
        self.add_file_to_clean(tree_file)

        output, error = cline()
        self.assertTrue(output.strip().startswith("CLUSTAL"))
        self.assertTrue(error.strip() == "")

        #Check the output...
        align = AlignIO.read(cline.outfile, "clustal")
        #The length of the alignment will depend on the version of clustalw
        #(clustalw 2.1 and clustalw 1.83 are certainly different).
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile,"clustal"))
        self.assertTrue(set(input_records.keys()) == set(output_records.keys()))
        for record in align:
            self.assertTrue(str(record.seq) == str(output_records[record.id].seq))
            self.assertTrue(str(record.seq).replace("-", "") ==
                   str(input_records[record.id].seq))

        #Check the DND file was created.
        #TODO - Try and parse this with Bio.Nexus?
        self.assertTrue(os.path.isfile(tree_file))
Example #16
			# We are going to normalise this sequence with the sklearn preprocessing algorithm to see what happens.
			queryarray = sklearn.preprocessing.scale(np.array(squiggle),axis=0,with_mean=True,with_std=True,copy=True)
			
			dist, cost, path = mlpy.dtw_subsequence(queryarray,kmerhash2[id][ref]['Fprime'])
			result.append((dist,id,"F",path[1][0],ref))
			dist, cost, path = mlpy.dtw_subsequence(queryarray,kmerhash2[id][ref]['Rprime'])
			result.append((dist,id,"R",path[1][0],ref))
			

	best = sorted(result, key=lambda r: r[0])[0]
	return best[1], best[0], best[2], best[3], best[4]

######################################################################

#######################################################
# Retrieve a model from the database rather than the  #
# expected data                                       #
#######################################################

def retrieve_model():
	model_kmers = dict()
	db = MySQLdb.connect(host=dbhost, user=dbusername, passwd=dbpass, port=dbport)
	cursor = db.cursor() 
	sql = "SELECT * FROM minion_LomanLabz_013731_11rx_v2_3135.model_data where model like '%template%'"
	cursor.execute(sql)			
	kmerresults = cursor.fetchall()
	for line in kmerresults:
		kmer = line[2]
		mean = line[4]
		#print kmer,mean
		model_kmers[kmer]=mean
	return model_kmers

							
Example #17
def extract(arguments):
    """
    Extract a reference alignment from a reference package
    """
    refpkg = arguments.refpkg

    # If not masking, just copy the sequences, reformatting if appropriate
    if not arguments.use_mask:
        with refpkg.open_resource('aln_sto') as input_fp:
            with arguments.output_file as output_fp:
                result = SeqIO.convert(input_fp, 'stockholm', output_fp,
                        arguments.output_format)
        logging.info("Wrote %d sequences", result)
        return

    # Mask will be applied if available
    with refpkg.open_resource('aln_sto') as fp:
        alignment_length = len(next(SeqIO.parse(fp, 'stockholm')))

        # Rewind
        fp.seek(0)
        sequences = SeqIO.parse(fp, 'stockholm')

        try:
            with refpkg.open_resource('mask') as fp:
                mask = AlignmentMask.from_csv_file(fp, alignment_length)
            logging.info("Applying mask - keeping %d/%d positions",
                    mask.unmasked_count, len(mask))
            sequences = mask.mask_records(sequences)
        except KeyError:
            log.warn("No mask found. Extracting all columns.")

        with arguments.output_file as output_fp:
            result = SeqIO.write(sequences, output_fp, arguments.output_format)
        logging.info("Wrote %d sequences.", result)
Example #18
def extract_seq_from_file(seq_file, coords_file, output_file):
	# Record the reference sequence names
	chrs = []

	# Store the segments for each reference
	chr_seg = {}

	# Count the segments
	cnt = 0

	seqio = SeqIO.parse(seq_file, 'fasta')
	for seq_record in seqio:
		chrs.append(seq_record.id)

	with open(coords_file, 'r') as f:
		for line in f:
			cnt += 1
			line = line.strip('\n')
			regions = re.split('\s+', line)

			if regions[0] not in chrs:
				log.warning('{0} not in reference sequence'.format(regions[0]))

			if len(regions) < 3:
				log.warning('This line has fewer than the 3 required fields')
				continue

			if regions[0] not in chr_seg:
				chr_seg[regions[0]] = []
				chr_seg[regions[0]].append(regions)
			else:
				chr_seg[regions[0]].append(regions)

	log.info('Summary: {0} chromosomes, {1} segments processed'.format(len(chr_seg), cnt))

	res_file_handle = open(output_file, 'w')

	# Iterate over the reference sequences
	seqio = SeqIO.parse(seq_file, 'fasta')
	for seq_record in seqio:
		if seq_record.id in chr_seg:
			for seg in chr_seg[seq_record.id]:
				try:
					# Create a SeqRecord object
					tmp_seq = SeqRecord.SeqRecord(seq=(seq_record.seq)[(int(seg[1])-1):int(seg[2])],
												  id='{0}:{1}..{2}:{3}'.format(seg[0], seg[1], seg[2], seg[3]))

					# When the strand is '-', take the reverse complement
					if seg[3] == '-':
						tmp_seq = tmp_seq.reverse_complement(id=True,
															 name=True,
															 description='reverse_complement')

					SeqIO.write(tmp_seq, res_file_handle, 'fasta')
				except Exception as e:
					log.error(e)
		else:
			log.warning(seq_record.id + ' does not exist in reference sequences')

	res_file_handle.close()
Example #19
def stitch_scaffolds(fa,outFile,len_limit=200000000,dist=500):
    """
    This function merges multiple scaffolds together to form longer sequences.
    * fa: str. Reference fa file name
    * outFile: str. Output file name.
    * len_limit: int. Maximum length of each merged scaffold.
    * dist: int. Number of Ns inserted between adjacent scaffolds.
    """
    in_handle = open(fa,'r')
    out_handle = open(outFile,'w')
    sequence = ''
    n = 1
    for record in SeqIO.parse(in_handle,'fasta'):
        sequence += str(record.seq)
        if len(sequence) >= len_limit:
            item = SeqRecord(Seq(sequence), id = 'chr'+str(n),description="")
            SeqIO.write(item,out_handle,'fasta')
            sequence = ''
            n += 1
        else:
            sequence += 'N'*dist
    # output the last one (strip the trailing spacer Ns)
    if sequence != '':
        item = SeqRecord(Seq(sequence[:-dist]), id = 'chr'+str(n),description="")
        SeqIO.write(item,out_handle,'fasta')
    out_handle.close()
    in_handle.close()
    # verify the merged scaffold lengths
    handle = open(outFile)
    for record in SeqIO.parse(handle,'fasta'):
        print(len(record.seq))
Example #20
def Pairfold_Execute_Parallel(Input_sRNA_File,Input_Target_File):
    sRNA_FASTA=list(SeqIO.parse(Input_sRNA_File,"fasta"))
    Target_FASTA=list(SeqIO.parse(Input_Target_File,"fasta"))

    for sRNA in sRNA_FASTA:
        RNAjobList=[]

        #for RNA in Target_FASTA:
        while True:
            try:
                if len(RNAjobList) < args.cpu:
                    RNA=Target_FASTA.pop(0)

                    shell_command="""/home/suu13/misc_stuff/MultiRNAFold-2.0/pairfold "%s" "%s" -m RNA | grep MFE | awk '{print $NF}' """ % (str(sRNA.seq),str(RNA.seq)) #enter pairfold actual path
                    job_id=subprocess.Popen(shell_command,shell=True,stdout=subprocess.PIPE)
                    RNAjobList.append([str(sRNA.description),str(RNA.description),job_id])
                else:
                    time.sleep(0.1) #wait for 0.1s to check status of jobs
                    for job in RNAjobList:
                        if subprocess.Popen.poll(job[2])!=None: #check the status of job object
                            print "%s\t%s\t%s" % (job[0],job[1],job[2].communicate()[0].strip())
                            RNAjobList.remove(job)
            except IndexError:  # Target_FASTA has been exhausted by pop(0)
                    while len(RNAjobList) != 0:
                        time.sleep(0.1) #wait for 0.1s to check status of jobs
                        for job in RNAjobList:
                            if subprocess.Popen.poll(job[2])!=None: #check the status of job object
                                print "%s\t%s\t%s" % (job[0],job[1],job[2].communicate()[0].strip())
                                RNAjobList.remove(job)
                    break



    return
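The hand-rolled Popen polling above can be expressed with a thread pool; a sketch of the same fan-out using concurrent.futures (a different mechanism than the snippet's, reusing its pairfold path; threads suffice because each worker only blocks on a subprocess):

import subprocess
from concurrent.futures import ThreadPoolExecutor

def pairfold_mfe(srna_seq, target_seq):
    # One pairfold run; pull the MFE value out of its report.
    cmd = '/home/suu13/misc_stuff/MultiRNAFold-2.0/pairfold "%s" "%s" -m RNA' % (srna_seq, target_seq)
    out = subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout
    mfe_lines = [l for l in out.splitlines() if "MFE" in l]
    return mfe_lines[0].split()[-1] if mfe_lines else None

def pairfold_parallel(srnas, targets, n_workers=4):
    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        for srna in srnas:
            futures = [(srna.description, t.description,
                        pool.submit(pairfold_mfe, str(srna.seq), str(t.seq)))
                       for t in targets]
            for name1, name2, fut in futures:
                print("%s\t%s\t%s" % (name1, name2, fut.result()))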
Example #21
def load_multi_database(gb_filename_or_handle, gb_filename_or_handle2):
    """Load two GenBank files into a new BioSQL database as different subdatabases.

    This is useful for running tests against a newly created database.
    """

    TESTDB = create_database()
    # now open a connection to load the database
    db_name = "biosql-test"
    db_name2 = "biosql-test2"
    server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB)
    db = server.new_database(db_name)

    # get the GenBank file we are going to put into it
    iterator = SeqIO.parse(gb_filename_or_handle, "gb")
    count = db.load(iterator)

    db = server.new_database(db_name2)

    # get the GenBank file we are going to put into it
    iterator = SeqIO.parse(gb_filename_or_handle2, "gb")
    # finally put it in the database
    count2 = db.load(iterator)
    server.commit()

    server.close()
    return count + count2
Example #22
def check_match(input, reference, output):
    fasta_list = []
    
    for in_record in SeqIO.parse(input, "fasta"): #parse input and reference seqs
        #translate to peptide seq
        orf = trans(str(in_record.seq))
        written = 0
            
        for aa_seq in orf:
            if (written == 0) and (len(aa_seq) >= 140): #shortest length of a ref seq
                for ref_record in SeqIO.parse(reference, "fasta"):
                    # pairwise alignment of input seq and each ref until a match found
                    # 1 point for match, -1 for mismatch, -.5 for gap open, -.1 for gap extension.
                    # Can alter scoring for looser alignments
                    align = pairwise2.align.localms(aa_seq, ref_record.seq, 1, -1, -.5, -.1, score_only=True)
            
                    #scores equal to ref length (100% alignment)
                    if align == len(ref_record.seq):
                        fasta_list.append('>%s\n%s\n' % (in_record.description + " len:" + str(len(aa_seq)), aa_seq))    
                        written = 1
                        break
    
    #write query descriptions and seqs that match ref
    with open(output + ".fna", 'a') as file:
        file.write('\n'.join(fasta_list))
Example #23
def fastaSubtract(fastaFiles):
    """
    Given a list of FASTA filenames, remove the reads found in the
    2nd, 3rd, etc. files from the first file in the list.

    @param fastaFiles: a C{list} of FASTA filenames.
    @raises IndexError: if passed an empty list.
    @return: An iterator producing C{Bio.SeqRecord} instances suitable for
        writing to a file using C{Bio.SeqIO.write}.

    """
    reads = {}
    firstFile = fastaFiles.pop(0)
    for seq in SeqIO.parse(firstFile, 'fasta'):
        reads[seq.id] = seq

    for fastaFile in fastaFiles:
        for seq in SeqIO.parse(fastaFile, 'fasta'):
            # Make sure that reads with the same id have the same sequence.
            if seq.id in reads:
                assert str(seq.seq) == str(reads[seq.id].seq)
            reads.pop(seq.id, None)

    return iter(reads.values())
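A usage sketch (file names hypothetical): the returned iterator can be written straight back out with SeqIO.write:

from Bio import SeqIO

remaining = fastaSubtract(['all_reads.fasta', 'contaminants.fasta'])
count = SeqIO.write(remaining, 'filtered.fasta', 'fasta')
print('kept %d reads' % count)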
Example #24
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("reads1",help='modified reads')
    parser.add_argument("reads2", help='reads to adjust')

    parser.add_argument('reads1_output',  help='output folder and filename. Note that the folder should already exist')
    parser.add_argument('reads2_output',  help='output folder and filename. Note that the folder should already exist')

    args = parser.parse_args()

    # we'll need to go through reads1 multiple times and it can be a large file,
    # so it's better to use a lambda that returns a fresh generator each call
    _reads1 = lambda: (rec for rec in SeqIO.parse(args.reads1, 'fastq'))
    _reads2 = (rec for rec in SeqIO.parse(args.reads2, 'fastq'))

    matching_reads2 = (read2 for read2 in next_matching_read(_reads1(), _reads2))
    synced_reads2_names = (read2 for read2 in adjust_name(_reads1(), matching_reads2))

    final_reads1 = (remove_space_from_sequence_header(r1) for r1 in _reads1())
    final_reads2 = (remove_space_from_sequence_header(r2) for r2 in synced_reads2_names)

    SeqIO.write(final_reads1, args.reads1_output, "fastq")
    SeqIO.write(final_reads2, args.reads2_output, "fastq")
Example #25
def write_joined(ffile, rfile, joined_file, length=None):
    freads = SeqIO.parse(ffile, 'fastq')
    rreads = SeqIO.parse(rfile, 'fastq')

    with open(joined_file, 'w') as outfile:
        for fread, rread in zip(freads, rreads):
            outfile.write(join_seqs(fread, rread, length=length).format('fastq'))
Example #26
    def read_seqs(self, sequence_file):
        """
        Read sequences from uniprot files (.dat or .fasta) or from lists or dicts of BioPython SeqRecords
        and make them available for fast search. Calling this again appends to the existing collection.

        :param sequence_file: uniprot files (.dat or .fasta)
        :return:
        """
        recs = sequence_file
        if not isinstance(sequence_file, dict) and not isinstance(sequence_file, list):
            try:
                with open(sequence_file, 'rb') as f:
                    if sequence_file.endswith('.fa') or sequence_file.endswith('.fasta'):
                        recs = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
                    else:  # assume it is a dat file
                        recs = SeqIO.to_dict(SeqIO.parse(open(sequence_file), 'swiss'))
            except:
                warnings.warn("Could not read file", UserWarning)
                return
        if isinstance(sequence_file, list):
            recs = SeqIO.to_dict(sequence_file)
        if recs:
            self.collection.update(recs)
            self.searchstring = '#'.join([str(x.seq) for x in self.collection.values()]).decode('ascii')
            self.accs = self.collection.keys()
            self.idx = list()
            self.idx.append(0)
            for i, v in enumerate(self.collection.values()):
                self.idx.append(1 + self.idx[-1] + len(self.collection.values()[i].seq))
        return
Example #27
def _open_seq(f):
	if os.path.splitext(f)[1].lower() in ['.gb', '.gbk', '.genbank', '.gen',]:
		return SeqIO.parse(f, 'genbank', alphabet=Alphabet.generic_dna)
	elif os.path.splitext(f)[1].lower() in ['.fas', '.fasta',]:
		return SeqIO.parse(f, 'fasta', alphabet=Alphabet.generic_dna)
	else:
		raise ValueError('Could not detect file type for \'{}\''.format(f))
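A usage sketch of the extension dispatch above (the file name is hypothetical); both branches yield the same kind of SeqRecord iterator:

for record in _open_seq('plasmid.gbk'):
    print(record.id, len(record.seq))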
Example #28
def main(argv):
    args = parseArgs()
    logging.basicConfig(level=logging.INFO, format=log_format)
    setup_mismatches(args.mismatches)

    with open(args.r1_fastq) as r1_in, \
            open(args.r2_fastq) as r2_in, \
            open(args.out_r1, 'wt') as r1_out, \
            open(args.out_r2, 'wt') as r2_out:

        r1_seqIO = SeqIO.parse(r1_in, "fastq")
        r2_seqIO = SeqIO.parse(r2_in, "fastq")
        try:
            while True:
                (r1, r2) = attach_umt(next(r1_seqIO), next(r2_seqIO))
                # Only write Fastq records for which we find stems
                if r1 is not None and r2 is not None:
                    umtstats['trimmed'] += 1
                    r1_out.write(r1.format("fastq"))
                    r2_out.write(r2.format("fastq"))
                else:
                    umtstats['no_stem'] += 1

        except StopIteration:
            logging.info("EOF reached")

    logging.info("Trimmed: %d" % umtstats['trimmed'])
    logging.info("No stem: %d" % umtstats['no_stem'])
Example #29
def index_database_dbm( dbfiles, outfile, type='fasta' ):
	DBM = anydbm.open( outfile, 'c' )
	if type == 'fasta':
		for db in dbfiles:
			handle = open(db)
			for seq_record in SeqIO.parse(handle, "fasta"): DBM[ seq_record.id ] = str(seq_record.seq)
			handle.close()
	if type == 'description':
		for db in dbfiles:
			handle = open(db)
			for seq_record in SeqIO.parse(handle, "fasta"): 
				DBM[ seq_record.id ] = seq_record.description[ seq_record.description.index(' ')+1 : ]
			handle.close()
	if type == 'annotation':
		for db in dbfiles:
			handle = open(db)
			for seq_record in SeqIO.parse(handle, "fasta"): 
				DBM[ seq_record.id ] = seq_record.annotations
			handle.close()
	if type == 'name':
		for db in dbfiles:
			handle = open(db)
			for seq_record in SeqIO.parse(handle, "fasta"): 
				DBM[ seq_record.id ] = seq_record.name
			handle.close()
	if type == 'features':
		for db in dbfiles:
			handle = open(db)
			for seq_record in SeqIO.parse(handle, "fasta"): 
				DBM[ seq_record.id ] = seq_record.features
			handle.close()
			
	DBM.close()
	return outfile
Example #30
def make(reads,barcodes,saveAs,name=False,mismatch=0,report=False):

    # fastools does not support pipe
    temp=tempnam(environ['HOME'])
    args=[config.get('paths','fastools'),reads,temp,mismatch]
    calls={seq.id: [] for seq in list(SeqIO.parse(reads,'fasta'))}
    counts=defaultdict(lambda: defaultdict(int))
    for barcode in SeqIO.parse(barcodes,'fasta'):

        # Fastools to check left of each sequence for barcode match
        call(cmd.format(*args+[-len(barcode),barcode.seq]).split())
        for seq in SeqIO.parse(temp,'fasta'): calls[seq.id]+=[barcode]
        remove(temp)
    with open(saveAs,'w') as handle:
        for seq in SeqIO.parse(reads,'fasta'):
            if calls[seq.id]:

                # only use best barcode
                barcode=sorted(calls[seq.id],key=lambda x: sum([
                    seq.seq.find(j,i)-i for i,j in enumerate(x)])-len(x))[0]
                seq.id+='|{0}'.format((barcode.seq,barcode.id)[name])
                counts[barcode][str(seq.seq[:len(barcode)+1])]+=1
            SeqIO.write(seq,handle,'fasta')

    if report:
        with open(path.splitext(saveAs)[0]+'.report.txt','w') as handle:
            handle.write('\n\n'.join(['\n'.join(
                ['{0}: {1}'.format(k.id,k.seq)]+
                ['{0} x {1}'.format(i,j) for i,j in sorted(v.items())])
                                      for k,v in counts.items()]))
Example #31
def profile_bam(bam, fasta, **kwargs):
    '''
    Profile the coverage of each scaffold in the .bam file.

    Bdb = coverage information on all scaffolds
    Sdb = SNP information

    Returns an SNVprofile bundling both tables
    '''
    # get arguments
    minP = kwargs.get('minP', .8)
    minC = kwargs.get('minC', 5)
    lightRAM = kwargs.get('lightRAM', False)

    # initialize
    table = defaultdict(list)  # set up coverage dataframe
    Atable = defaultdict(list)  # set up ANI dataframe
    Stable = defaultdict(list)  # Set up SNP table

    samfile = pysam.AlignmentFile(bam)  # set up .sam file

    scaff2sequence = SeqIO.to_dict(SeqIO.parse(fasta,
                                               "fasta"))  # set up .fasta file
    s2l = {s: len(scaff2sequence[s])
           for s in list(scaff2sequence.keys())}  # Get scaffold2length

    # initialize new goodies on the scaffold level
    if not lightRAM:
        scaff2covT = {}
        scaff2basesCounted = {}
        scaff2snpsCounted = {}

    # Iterate scaffolds
    for scaff in tqdm(s2l, desc='Scaffolds processed'):
        covT = defaultdict(lambda: np.zeros(s2l[scaff], dtype=int)
                           )  # Dictionary of mm -> positional coverage
        basesCounted = defaultdict(
            lambda: np.zeros(s2l[scaff], dtype=bool)
        )  # Count of bases that got through to SNP calling
        snpsCounted = defaultdict(
            lambda: np.zeros(s2l[scaff], dtype=bool))  # Count of SNPs

        try:
            pileup_iter = samfile.pileup(scaff)
        except ValueError:
            print("scaffold {0} is not in the .bam file {1}!".format(
                scaff, bam))
            continue

        for pileupcolumn in pileup_iter:
            # Iterate reads at this position to figure out basecounts
            # note: pileupcolumn.pos is 0-based
            MMcounts = _get_base_counts_mm(pileupcolumn)
            _update_covT(covT, MMcounts, pileupcolumn.pos)

            # Call SNPs
            _update_snp_table_T(Stable, basesCounted,\
                    snpsCounted, scaff2sequence[scaff][pileupcolumn.pos], MMcounts,\
                    pileupcolumn.pos, scaff, minC=minC, minP=minP)

        # Update coverage table
        _update_covT_table(table, covT, s2l[scaff], scaff)

        # Update ANI table
        _update_snp_covT_table(Atable, snpsCounted, basesCounted, s2l[scaff], \
                scaff, covT, minC)

        # Add to dicts
        if not lightRAM:
            scaff2covT[scaff] = dict(covT)
            scaff2basesCounted[scaff] = dict(basesCounted)
            scaff2snpsCounted[scaff] = dict(snpsCounted)

    # Make the profile
    Sprofile = SNVprofile(
        fasta_loc=fasta,
        bam_loc=bam,
        minP=minP,
        minC=minC,
        scaffold2length=s2l,
        raw_coverage_table=pd.DataFrame(table),
        raw_ANI_table=pd.DataFrame(Atable),
        raw_snp_table=_make_snp_table(Stable),
    )

    if not lightRAM:
        # Add the extra weight
        for att in ['scaff2covT', 'scaff2basesCounted', 'scaff2snpsCounted']:
            setattr(Sprofile, att, eval(att))

    # Make the tables
    Sprofile.make_cumulative_tables()

    return Sprofile
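A usage sketch (file names hypothetical), passing the two thresholds the function reads via kwargs.get, and assuming SNVprofile exposes its constructor arguments as attributes:

profile = profile_bam('mapped.bam', 'assembly.fasta', minP=0.8, minC=5)
print(profile.raw_coverage_table.head())  # per-scaffold coverage table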
Example #32
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",'--input',type=str,help="input fasta")
    parser.add_argument("-k",'--kmer',type=str,help="exact match to search")
        
    args = parser.parse_args()
    from Bio.Seq import Seq
    from Bio.Alphabet import generic_dna
    
    target_seq = Seq(args.kmer, generic_dna)

    
    from Bio import SeqIO
    handle = open(args.input, "rU")
    for record in SeqIO.parse(handle, "fasta") :
        seq = record.seq
        #import time
        #time.sleep(3)
        for i in range(0,len(seq)):
            #if i%1000000 == 0:
            #    print i
            #print seq[i:i+len(args.kmer)]
            subseq = seq[i:i+len(target_seq)]
            #print subseq
            #print args.kmer
            if str(subseq) == str(target_seq):
                print("MATCH:", i, i + len(target_seq), str(target_seq))
            elif str(subseq) == str(target_seq.reverse_complement()):
                print("MATCH:", i, i + len(target_seq), str(target_seq), "(rev. complement)")
            elif str(subseq) == str(target_seq.complement()):
                print("MATCH:", i, i + len(target_seq), str(target_seq), "(complement)")
Example #33
    def out():
        infile1 = e1.get()
        infile2 = e2.get()
        outfile = e3.get()

        for record in SeqIO.parse(infile1, "fasta"):
            seq1 = record.seq
            seq1name = record.id

        for record in SeqIO.parse(infile2, "fasta"):
            seq2 = record.seq
            seq2name = record.id

        file = open(outfile, "w")
        #file.write("blablablabla")
        blomat = matrix
        log = type
        metadata = "#########################################\n# First Sequence Length: %s\n" \
                   "# First Sequence Name: %s\n# First Sequence File: %s\n" \
                   "# Second Sequence Length: %s\n# Second Sequence Name: %s\n" \
                   "# Second Sequence File: %s\n# Output Result File: %s\n# Program: Pairwise\n# Matrix: %s\n" \
                   "# Type: %s\n# Gap Open: %s\n# Gap Extend: %s\n# Note: Given Result May Contain Different Alignments with " \
                   "Different Scores or Different Alignments With The Same Score.\n#########################################\n\n" \
                   "" % (len(seq1),seq1name, infile1, len(seq2),seq2name,infile2,outfile, blomat, log ,gapopen,gapextend)
        file.write(metadata)
        print(blomat)
        print(log)
        if blomat == "BLOSUM 30":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum30,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum30,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
        elif blomat == "BLOSUM 35":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum35,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum35,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
        elif blomat == "BLOSUM 40":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum40,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum40,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
        elif blomat == "BLOSUM 45":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum45,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum45,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
        elif blomat == "BLOSUM 50":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum50,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum50,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 55":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum55,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum55,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 60":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum60,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum60,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 62":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum62,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum62,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 65":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum65,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum65,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 70":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum70,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum70,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 75":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum75,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum75,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 80":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum80,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum80,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 85":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum85,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum85,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 90":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum90,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum90,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 95":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum95,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum95,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)

        elif blomat == "BLOSUM 100":
            if log == "Local":
                for a in pairwise2.align.localds(seq1, seq2, blosum100,
                                                 int(gapopen), int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
            else:
                for a in pairwise2.align.globalds(seq1, seq2, blosum100,
                                                  int(gapopen),
                                                  int(gapextend)):
                    # print(format_alignment(*a))
                    alignments2 = pairwise2.format_alignment(*a)
                    print(alignments2)
                    file.write(alignments2)
        file.close()

        win.destroy()

        r = open(outfile, 'r').read()
        root = Tk()
        S = Scrollbar(root)
        T = Text(root, height=50, width=500)
        S.pack(side=RIGHT, fill=Y)
        T.pack(side=LEFT, fill=Y)
        S.config(command=T.yview)
        T.config(yscrollcommand=S.set)
        quote = r
        T.insert(END, quote)
        mainloop()
Example #34
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

from Bio.Restriction import Restriction
from Bio import SeqIO

#print (Restriction.Sau3AI.site)
print (Restriction.PstI.site)

records = SeqIO.parse("C:\\Users\\user\\Downloads\\test_restriction.fa", "fasta")

for i in records:
    name = i.id
    #digest = Restriction.ApeKI.catalyse(i.seq)
    hits = Restriction.PstI.search(i.seq)

    for d in hits:
        print ("{0}\t{1}\t{2}".format(name, str(d), str(d+1)))


## Ref
##        https://github.com/daler/rdbio-scripts/blob/master/sequenceFiles/restriction-finder.py
        
##        http://coco.sam.pitt.edu/~emeneses/python/lecture8.pdf   Page 20

Example #35
from Bio import SeqIO

list_file = "/srv/projects3/human_plasmids/georgina/16_Host_Linkage/1_Blast/Outv2/MAGS_plasmidA_list.txt"
# open list text file
f = open(list_file, 'r')
# set mag directory
mag_dir = "/srv/projects3/human_plasmids/georgina/7_coverm/reformatted_mags/"
# set cluster directory
n = 0
for line in f.readlines():
    subject = line.split("\t")[1].strip("\n\t")
    print("Subject:", subject)
    assemblyname = "f." + subject.split("_")[0]
    print("Assembly:", assemblyname)
    assembly = list(SeqIO.parse("{}/{}".format(mag_dir, assemblyname),
                                'fasta'))
    print("Opening Assembly {} of length {}".format(assemblyname,
                                                    len(assembly)))
    new = []
    for seq_record in assembly:
        if seq_record.id != subject:
            new.append(seq_record)
        elif seq_record.id == subject:
            n = n + 1
    SeqIO.write(new, "{}/{}".format(mag_dir, assemblyname), 'fasta')
print("Done! Deleted ", n, " contigs.")
Example #36
def get_LFY_binding_sites(matScore, matRev, FastaFile, threshold,
                          factorTranscription):

    # Retrieve all the sequences from the fasta file
    sequences = SeqIO.to_dict(SeqIO.parse(FastaFile, "fasta"))
    print("  There are %s sequence(s) to analyze" % (len(sequences)))

    list_of_the_LFY_binding_sites = []
    # Loop over all the fasta sequences:
    for s in sequences:


        # Retrieve the DNA sequence and parse chrom/positions from its id.
        seq = sequences[s].seq
        seq_id = sequences[s].id
        chrom = re.split(':', seq_id)
        pos = re.split(':|-', seq_id)

        # Slide a window over the whole sequence; each window has the same length as the matrix (lenMotif).
        for c in range(len(seq) - (lenMotif - 1)):
            strandPos = seq[c:c + lenMotif].upper()
            test = 0
            for nu in strandPos:
                if nu not in ["A", "C", "G", "T"]:
                    test = 1
            if test == 1:
                score = "NA"
            else:
                # Score this window on both strands against the flattened matrix.
                index = 0
                scoreStrandPos = 0
                scoreStrandNeg = 0
                while index < lenMotif:
                    if strandPos[index] == 'A':
                        scoreStrandPos = scoreStrandPos + matScore[index * 4]
                        scoreStrandNeg = scoreStrandNeg + matRev[index * 4]
                    elif strandPos[index] == 'C':
                        scoreStrandPos = scoreStrandPos + matScore[index * 4 +
                                                                   1]
                        scoreStrandNeg = scoreStrandNeg + matRev[index * 4 + 1]
                    elif strandPos[index] == 'G':
                        scoreStrandPos = scoreStrandPos + matScore[index * 4 +
                                                                   2]
                        scoreStrandNeg = scoreStrandNeg + matRev[index * 4 + 2]
                    elif strandPos[index] == 'T':
                        scoreStrandPos = scoreStrandPos + matScore[index * 4 +
                                                                   3]
                        scoreStrandNeg = scoreStrandNeg + matRev[index * 4 + 3]
                    index += 1

                # Add the scores associated with interdependent positions.
                if factorTranscription == "LFY_scores_matrix_19nucl":
                    scoreStrandPos, scoreStrandNeg = add_scores_associated_with_interdependent_positions(
                        get_dependency_matrix(dependencyFile, num),
                        scoreStrandPos, scoreStrandNeg, strandPos)

                # Record the chromosome and the positions of each predicted binding site (score above the user-fixed threshold).
                if scoreStrandPos > threshold or scoreStrandNeg > threshold:
                    list_of_the_LFY_binding_sites.append([
                        chrom[0].replace('chr', ''),
                        int(pos[1]) + c + 1,
                        int(pos[1]) + c + 1 + 19,
                        str(strandPos[0:19])
                    ])

    return (list_of_the_LFY_binding_sites)
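
# The elif chain above walks a position-weight matrix stored flat, four
# scores (A, C, G, T) per motif position. A dict lookup expresses the
# same scoring more compactly (a sketch; matScore/matRev are flattened
# exactly as in the original, and the window is assumed to have passed
# the ACGT check):
BASE_COLUMN = {"A": 0, "C": 1, "G": 2, "T": 3}

def score_window(window, matScore, matRev):
    score_pos = score_neg = 0.0
    for index, base in enumerate(window):
        column = index * 4 + BASE_COLUMN[base]
        score_pos += matScore[column]
        score_neg += matRev[column]
    return score_pos, score_neg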
Example #37
0
import sys

from Bio import SeqIO
from Bio.SeqUtils import GC

# Argument one is a fasta file
print "GC_content"
for rec in SeqIO.parse(sys.argv[1], "fasta"):
    print "%s\t%1.2f" % (rec.id, GC(rec.seq) / 100.0)
### Input
import glob
import gzip
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO

directory_tst_ffpe = '/media/partition/fastq_tst_ffpe'
directory_tst_cll = '/media/partition/fastq_tst_cll'

fastq_tst_ffpe = [
    f for f in glob.iglob(directory_tst_ffpe + "/*R1_001.fastq.gz")
]
fastq_tst_cll = [
    f for f in glob.iglob(directory_tst_cll + "/*R1_001.fastq.gz")
]

### Open file & parse over it
for file in fastq_tst_ffpe:
    print(file)
    handle = gzip.open(file, 'rt')  # text mode so SeqIO gets str, not bytes
    recs = SeqIO.parse(handle, "fastq")
    d1 = defaultdict(list)
    for rec in recs:
        for pos, qual in enumerate(rec.letter_annotations['phred_quality']):
            d1[pos].append(qual)
    # Mean quality per position, sorted so positions stay in order even
    # when read lengths differ.
    means = [np.mean(vals) for pos, vals in sorted(d1.items())]

    x = np.arange(len(means))

    plt.plot(x, means, color='red')
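
# d1 keeps every quality value in memory. A running-sum variant (a
# sketch) needs only one sum/count pair per position:
import gzip
from Bio import SeqIO

def per_position_means(fastq_gz):
    sums, counts = [], []
    with gzip.open(fastq_gz, "rt") as handle:
        for rec in SeqIO.parse(handle, "fastq"):
            quals = rec.letter_annotations["phred_quality"]
            if len(quals) > len(sums):
                grow = len(quals) - len(sums)
                sums.extend([0] * grow)
                counts.extend([0] * grow)
            for i, q in enumerate(quals):
                sums[i] += q
                counts[i] += 1
    return [s / c for s, c in zip(sums, counts)]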
Example #39
0
def read_fasta(filepath):
    # 'rU' mode was removed in Python 3.11; plain 'r' does the same here.
    with open(filepath, 'r') as fasta_file:
        return list(SeqIO.parse(fasta_file, 'fasta'))
Example #40
0
def main(cpus,
         dun_make_bins=False,
         dun_use_partial=False,
         num_seqs_per_batch=100000,
         dun_cleanup_files=False):
    print "Indexing isoseq_flnc.fasta using LazyFastaReader..."
    d = LazyFastaReader('isoseq_flnc.fasta')

    print "Splitting input isoseq_flnc.fasta into seed/batches..."
    num_batchs = create_seed_n_batch_files(
        input='isoseq_flnc.fasta',
        fasta_d=d,
        seed_filename='seed0.fasta',
        batch_pre='batch',
        num_seqs_per_batch=num_seqs_per_batch)

    # step1. run minimap of seed0 against itself and process
    o = ar.run_minimap('seed0.fasta', 'seed0.fasta', cpus=cpus)
    seqids = set([r.id for r in SeqIO.parse(open('seed0.fasta'), 'fasta')])
    pCS, orphans = sp.process_self_align_into_seed(
        o, seqids, MiniReader, dun_use_partial=dun_use_partial)
    # keep stats
    size_S, size_tucked, size_orphans = len(pCS.S), sum(
        v == 'T' for v in pCS.seq_stat.itervalues()), len(orphans)
    print "seed 0 initial: S {0}, tucked {1}, orphans {2}".format(
        size_S, size_tucked, size_orphans)

    # write out seed1.S.fasta and seed1.orphans.fasta
    FileIO.write_preClusterSet_to_fasta(pCS, 'seed1.S.fasta', d)
    FileIO.write_seqids_to_fasta(orphans, 'seed1.orphans.fasta', d)
    # step 2a. minimap batch1 against seed1.S and process

    for i in xrange(1, num_batchs):
        pCS, orphans = add_batch(i,
                                 pCS,
                                 orphans,
                                 d,
                                 cpus=cpus,
                                 dun_use_partial=dun_use_partial)
        cleanup_precluster_intermediate_files(i)

    # detect PCR chimeras from orphans
    chimeras = detect_PCR_chimeras(orphans, d)
    orphans = orphans.difference(chimeras)

    FileIO.write_seqids_to_fasta(orphans, "preCluster_out.orphans.fasta", d)
    FileIO.write_seqids_to_fasta(chimeras, "preCluster_out.chimeras.fasta", d)

    tucked_seqids = []
    # dump pCS, orphans, chimeras to a pickle
    # can't dump yet --- since pCS is an object
    #with open('preCluster.output.pickle', 'w') as f:
    #    dump({'pCS': pCS, 'chimeras': chimeras, 'orphans': orphans}, f)
    # write CSV file
    with open('preCluster.output.csv', 'w') as f:
        f.write("seqid,stat\n")
        for x, stat in pCS.seq_stat.iteritems():
            if stat == 'T':
                f.write("{0},tucked\n".format(x))
                tucked_seqids.append(x)
            elif stat == 'M':
                f.write("{0},{1}\n".format(x, pCS.seq_map[x]))
        for x in orphans:
            f.write("{0},orphan\n".format(x))
        for x in chimeras:
            f.write("{0},chimera\n".format(x))

    # Liz: currently not using tucked...
    if len(tucked_seqids) > 0:
        FileIO.write_seqids_to_fasta(tucked_seqids,
                                     "preCluster_out.tucked.fasta", d)

    infof = open('preCluster.cluster_info.csv', 'w')
    infof.write("cluster,size\n")
    # write out a directory per preCluster cid in preCluster_out/<cid>
    # Liz note: right now, write out even directories with just 1 sequence
    # (we know they have "tucked" support, so can run Partial/Arrow on it)
    #singlef = open("preCluster_out.singles.fasta", 'w')
    for cid in pCS.S:
        #    if pCS.S[cid].size == 1:
        #        r = d[pCS.S[cid].members[0]]
        #        singlef.write(">{0}\n{1}\n".format(r.id, r.seq))
        #    else:
        if True:
            if not dun_make_bins:
                dirname = os.path.join("preCluster_out", str(cid))
                os.makedirs(dirname)
                file = os.path.join(dirname, 'isoseq_flnc.fasta')
                FileIO.write_seqids_to_fasta(pCS.S[cid].members, file, d)
            infof.write("{0},{1}\n".format(cid, len(pCS.S[cid].members)))
    #singlef.close()
    infof.close()

    if not dun_cleanup_files:  # clean up all seed* and batch* files
        for file in glob.glob('batch*fasta*'):
            os.remove(file)
        for file in glob.glob('seed*fasta*'):
            os.remove(file)
Example #41
0
def create_flats_and_lisp(run_folder, taxon_file):
    """
    Read Genbank/GFF/PF files and create Pathway Tools needed file.
    Create also a lisp file to create flat files from Pathway tools results.
    The name of the PGDB created by Pathway Tools will be the name of the species with '_' instead of space.

    Create organism-params.dat:
    ID  pgdb_id
    STORAGE FILE
    NCBI-TAXON-ID   taxon_id
    NAME    species_name

    Create genetic-elements.dats:
    NAME    
    ANNOT-FILE  gbk_name
    //

    Create flat_files_creation.lisp:
    (in-package :ecocyc)
    (select-organism :org-id 'pgdb_id)
    (create-flat-files-for-current-kb)

    Args:
        run_folder (str): ID of a species of the input folder
        taxon_file (bool): Boolean indicating if a taxon_file must be used
    Returns:
        list: boolean list, True if all files have been created
   """
    # Look for a Genbank/GFF files in the run folder.
    # PGDB ID corresponds to the name of the species folder.
    pgdb_id = os.path.basename(run_folder)
    gbk_name = pgdb_id + ".gbk"
    gbk_pathname = os.path.join(run_folder, gbk_name)
    gbff_name = pgdb_id + ".gbff"
    gbff_pathname = os.path.join(run_folder, gbff_name)
    gff_name = pgdb_id + ".gff"
    gff_pathname = os.path.join(run_folder, gff_name)

    organism_dat = os.path.join(run_folder, 'organism-params.dat')
    genetic_dat = os.path.join(run_folder, 'genetic-elements.dat')
    lisp_pathname = os.path.join(run_folder, 'flat_files_creation.lisp')

    fasta_extensions = ['.fasta', '.fsa']

    taxon_id = ""
    taxon_error = False
    species_name = ""
    taxon_datas = {}

    if os.path.isfile(gbk_pathname) or os.path.isfile(gbff_pathname):
        if os.path.isfile(gbk_pathname):
            input_name = gbk_name
            input_path = gbk_pathname
        else:
            input_name = gbff_name
            input_path = gbff_pathname
        # Take the species name and the taxon id from the genbank file.
        with open(input_path, "r") as gbk:
            # Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
            try:
                first_seq_record = next(SeqIO.parse(gbk, "genbank"))
            except StopIteration:
                logger.critical(
                    'Issue with the genbank {0}: it may be empty or malformed.'
                    .format(input_path))
                return None

            try:
                species_name = first_seq_record.annotations['organism']
            except KeyError:
                logger.critical(
                    'No organism in the Genbank {0}. In the SOURCE you must have: ORGANISM  Species name'
                    .format(pgdb_id))
                return None

            # Take the source feature of the first record.
            # This feature contains the taxon ID in the db_xref qualifier.
            src_features = [
                feature for feature in first_seq_record.features
                if feature.type == "source"
            ]
            for src_feature in src_features:
                if 'db_xref' in src_feature.qualifiers:
                    src_dbxref_qualifiers = src_feature.qualifiers['db_xref']
                    for src_dbxref_qualifier in src_dbxref_qualifiers:
                        if 'taxon:' in src_dbxref_qualifier:
                            taxon_id = src_dbxref_qualifier.replace(
                                'taxon:', '')
                if not taxon_id:
                    logger.info(
                        'No taxon ID in the Genbank {0}. In the FEATURES source you must have: /db_xref="taxon:taxonid", where taxonid is the ID of your organism. You can find it on the NCBI.'
                        .format(gbk_pathname))
                    logger.info('Try to look in the taxon_id.tsv file')
                    taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                        run_folder, pgdb_id, taxon_id, taxon_file)
            if taxon_file:
                taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                    run_folder, pgdb_id, taxon_id, taxon_file)

    elif os.path.isfile(gff_pathname):
        input_name = gff_name
        # Check if there is a fasta file.
        gff_fasta = None
        for fasta_extension in fasta_extensions:
            fasta_input_name = input_name.replace('.gff', fasta_extension)
            fasta_path = os.path.join(run_folder, fasta_input_name)
            if os.path.exists(fasta_path):
                gff_fasta = fasta_input_name
        if not gff_fasta:
            logger.critical(
                'No fasta file (.fasta or .fsa) with the GFF of {0}'.format(
                    pgdb_id))
            return None

        # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
        try:
            region_feature = [
                feature for feature in DataIterator(gff_pathname)
                if feature.featuretype == 'region'
            ][0]
        except IndexError:
            logger.critical(
                'No region feature in the GFF file of {0}, GFF file must have region features.'
                .format(pgdb_id))
            return None

        try:
            region_feature.attributes['Dbxref']
        except KeyError:
            logger.critical(
                'No Dbxref in GFF file of {0}. GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                .format(pgdb_id))
            # The loop below would raise the same KeyError; bail out like
            # the other error paths.
            return None

        for dbxref in region_feature.attributes['Dbxref']:
            if 'taxon' in dbxref:
                taxon_id = dbxref.split('taxon:')[1]
        if not taxon_id or taxon_file:
            if not taxon_id:
                logger.info(
                    'Missing "taxon:" in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                    .format(pgdb_id))
                logger.info('Try to look in the taxon_id.tsv file')
            taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                run_folder, pgdb_id, taxon_id, taxon_file)

    # Look for PF files. The original test used all([True for f in ...]),
    # which is vacuously True when nothing matches; any() is the intended
    # check.
    elif any('.pf' in species_file or '.fasta' in species_file
             or '.fsa' in species_file
             for species_file in os.listdir(run_folder)):
        for species_file in os.listdir(run_folder):
            if '.pf' in species_file:
                # Check if there is a fasta file.
                pf_fasta = None
                for fasta_extension in fasta_extensions:
                    fasta_species_name = species_file.replace(
                        '.pf', fasta_extension)
                    fasta_path = os.path.join(run_folder, fasta_species_name)
                    if os.path.exists(fasta_path):
                        pf_fasta = fasta_species_name
                if not pf_fasta:
                    logger.critical(
                        'No fasta file (.fasta or .fsa) with the Pathologic file of {0}, this could lead to warnings in Pathway Tools.'
                        .format(pgdb_id))

        taxon_error, taxon_id, taxon_datas = extract_taxon_id(
            run_folder, pgdb_id, taxon_id, taxon_file)

    if taxon_error:
        logger.critical('Issue with taxon ID of {0}.'.format(run_folder))
        return None

    # Create the organism-params dat file.
    with open(organism_dat, 'w', encoding='utf-8') as organism_file:
        organism_writer = csv.writer(organism_file,
                                     delimiter='\t',
                                     lineterminator='\n')
        organism_writer.writerow(['ID', pgdb_id])
        organism_writer.writerow(['STORAGE', "FILE"])
        organism_writer.writerow(['NCBI-TAXON-ID', taxon_id])
        organism_writer.writerow(['NAME', species_name])
        if 'reference_pgdbs' in taxon_datas:
            for reference_pgdb in taxon_datas['reference_pgdbs']:
                organism_writer.writerow(['REF-ORGID', reference_pgdb])

    # Create the genetic-elements dat file.
    with open(genetic_dat, 'w', encoding='utf-8') as genetic_file:
        if os.path.isfile(gff_pathname) or os.path.isfile(
                gbk_pathname) or os.path.isfile(gbff_pathname):
            genetic_writer = csv.writer(genetic_file,
                                        delimiter='\t',
                                        lineterminator='\n')
            genetic_writer.writerow(['NAME', ''])
            genetic_writer.writerow(['ANNOT-FILE', input_name])
            if os.path.isfile(gff_pathname):
                genetic_writer.writerow(['SEQ-FILE', gff_fasta])
            if 'circular' in taxon_datas:
                circular = taxon_datas['circular']
                genetic_writer.writerow(['CIRCULAR?', circular])
            if 'element_type' in taxon_datas:
                element_type = taxon_datas['element_type']
                genetic_writer.writerow(['TYPE', element_type])
            if 'codon_table' in taxon_datas:
                codon_table = taxon_datas['codon_table']
                genetic_writer.writerow(['CODON-TABLE', codon_table])
            genetic_writer.writerow(['//'])
        elif any('.pf' in species_file or '.fasta' in species_file
                 or '.fsa' in species_file
                 for species_file in os.listdir(run_folder)):
            genetic_writer = csv.writer(genetic_file,
                                        delimiter='\t',
                                        lineterminator='\n')
            for species_file in os.listdir(run_folder):
                if '.pf' in species_file:
                    species_file_name = os.path.splitext(species_file)[0]
                    genetic_writer.writerow(
                        ['NAME', species_file.replace('.pf', '')])
                    genetic_writer.writerow(
                        ['ID', species_file.replace('.pf', '')])
                    genetic_writer.writerow(['ANNOT-FILE', species_file])
                    fasta_path = os.path.join(
                        run_folder, species_file.replace('.pf', '.fasta'))
                    fsa_path = os.path.join(
                        run_folder, species_file.replace('.pf', '.fsa'))
                    if os.path.exists(fasta_path):
                        genetic_writer.writerow([
                            'SEQ-FILE',
                            species_file.replace('.pf', '.fasta')
                        ])
                    elif os.path.exists(fsa_path):
                        genetic_writer.writerow(
                            ['SEQ-FILE',
                             species_file.replace('.pf', '.fsa')])

                    if species_file_name in taxon_datas:
                        if 'circular' in taxon_datas[species_file_name]:
                            circular = taxon_datas[species_file_name][
                                'circular']
                            genetic_writer.writerow(['CIRCULAR?', circular])
                        if 'element_type' in taxon_datas[species_file_name]:
                            element_type = taxon_datas[species_file_name][
                                'element_type']
                            genetic_writer.writerow(['TYPE', element_type])
                        if 'codon_table' in taxon_datas[species_file_name]:
                            codon_table = taxon_datas[species_file_name][
                                'codon_table']
                            genetic_writer.writerow(
                                ['CODON-TABLE', codon_table])
                    else:
                        if 'circular' in taxon_datas:
                            circular = taxon_datas['circular']
                            genetic_writer.writerow(['CIRCULAR?', circular])
                        if 'element_type' in taxon_datas:
                            element_type = taxon_datas['element_type']
                            genetic_writer.writerow(['TYPE', element_type])
                        if 'codon_table' in taxon_datas:
                            codon_table = taxon_datas['codon_table']
                            genetic_writer.writerow(
                                ['CODON-TABLE', codon_table])
                    genetic_writer.writerow(['//'])
    # Create the lisp script.
    check_lisp_file = create_flat_creation_script(pgdb_id, lisp_pathname)

    return all([
        os.path.isfile(organism_dat),
        os.path.isfile(genetic_dat), check_lisp_file
    ])
    def Analyze(self):

        print "Loading results ..."
        bar = progressbar.ProgressBar()
        results = []
        for i in bar(range(self.AlLen - self.window)):
            # print "Loading %s/%s" % (i, self.AlLen - self.window)
            df = self.Parse_HMMER_output('%s_%s.out' % (self.name, i))
            res = self.Extract_kingdoms(df)
            results.append(res)

        print "Compute ratio"
        # save ratio in array
        df = []
        if os.path.isfile(self.name + '_database.fasta'):
            res = {'B': 0, 'E': 0, 'A': 0, 'O': 0, 'Other': 0}
            tot = 0
            seqs = SeqIO.parse(self.name + '_database.fasta', format='fasta')
            for i in seqs:
                specID = i.name.split('|')[2].split('_')[1]
                kg = self.Get_kingdom(specID, i.description, i.name.split('|')[1])
                if kg in res.keys():
                    res[kg] += 1
                else:
                    res['Other'] += 1
                tot += 1
            if self.osk:
                df.append(['db', res['B'], res['E'], res['A'], res['O'], res['Other'], 0])
            else:
                df.append(['db', res['B'], res['E'], res['A'], res['Other'], 0])
        counts = []
        j = 0
        for i in results:
            tmp = []
            tmp.append(i.count('B'))
            tmp.append(i.count('E'))
            tmp.append(i.count('A'))
            if self.osk:
                tmp.append(i.count('O'))
                tmp.append(len(i) - (i.count('B') + i.count('A') + i.count('E') + i.count('O')))
            else:
                tmp.append(len(i) - (i.count('B') + i.count('A') + i.count('E')))
            counts.append(tmp)
            p = 1 - self.alignment[:, j].count('-') / float(len(self.alignment[:, j]))
            df.append([j] + tmp + [p])
            j += 1
        counts = np.array(counts).T
        if self.osk:
            df = pd.DataFrame(df, columns=['Position', 'Bacteria', 'Eukaryotes', 'Arthropods', 'Oskar', 'Other', 'Occupancy'])
        else:
            df = pd.DataFrame(df, columns=['Position', 'Bacteria', 'Eukaryotes', 'Arthropods', 'Other', 'Occupancy'])
        df.to_csv(self.name + '.csv', index=False)
        # return results, counts
        # plot ratio
        # sns.
        # http://matplotlib.org/examples/pylab_examples/stackplot_demo.html
        print "Plotting results"
        fig, ax = plt.subplots()
        if self.osk:
            t = ax.stackplot(range(len(counts[0])), counts[0], counts[1], counts[2], counts[3], counts[4])
            leg = ['Bacteria', 'Eukaryotes', 'Arthropoda', 'Oskar', 'Other']
        else:
            t = ax.stackplot(range(len(counts[0])), counts[0], counts[1], counts[2], counts[3])
            leg = ['Bacteria', 'Eukaryotes', 'Arthropoda', 'Other']
        handles = []
        for i in range(len(t)):
            handles.append(mpatches.Patch(color=t[i].get_facecolor()[0], label=leg[i]))
        ax.legend(handles=handles)
        plt.title(self.name)
        fig.savefig("%s.pdf" % self.name)
        fig.savefig("%s.png" % self.name)
Example #43
0
import pandas as pd
from Bio import SeqIO
from math import floor
from sys import argv

# read files
region_file = argv[1]
results_file = argv[2]
ref_file = argv[3]
alignment_file = argv[4]

regions = pd.read_csv(region_file)
df = pd.read_csv(results_file)
new_df = pd.DataFrame(columns=["sample", "mutations", "aa_mut"])
# read sequences
reference = list(SeqIO.read(ref_file, 'fasta').seq)
alignment = SeqIO.to_dict(SeqIO.parse(alignment_file, 'fasta'))
for sample, record in alignment.items():
    alignment[sample] = list(str(record.seq).upper())

translate_table = {
    'ATA': 'I',
    'ATC': 'I',
    'ATT': 'I',
    'ATG': 'M',
    'ACA': 'T',
    'ACC': 'T',
    'ACG': 'T',
    'ACT': 'T',
    'AAC': 'N',
    'AAT': 'N',
    'AAA': 'K',
Example #44
0
def read_taxon_id(run_folder):
    """
    Search for Taxon ID in genbank or GFF files.
    For GenBank file searc for ''taxon:' key in 'db_xref' qualifier.
    For GFF file search for 'taxon' in dbxref feature.

    Args:
        run_folder (str): path to the input folder
    """
    taxon_ids = {}

    for input_folder in os.listdir(run_folder):
        input_folder_path = os.path.join(run_folder, input_folder)
        for input_file in os.listdir(input_folder_path):
            if '.gbk' in input_file:
                gbk_pathname = os.path.join(input_folder_path, input_file)
                # Take the species name and the taxon id from the genbank file.
                with open(gbk_pathname, "r") as gbk:
                    # Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
                    first_seq_record = next(SeqIO.parse(gbk, "genbank"))
                    # Take the source feature of the first record.
                    # This feature contains the taxon ID in the db_xref qualifier.
                    src_features = [
                        feature for feature in first_seq_record.features
                        if feature.type == "source"
                    ]
                    for src_feature in src_features:
                        try:
                            src_dbxref_qualifiers = src_feature.qualifiers[
                                'db_xref']
                            for src_dbxref_qualifier in src_dbxref_qualifiers:
                                if 'taxon:' in src_dbxref_qualifier:
                                    taxon_id = src_dbxref_qualifier.replace(
                                        'taxon:', '')
                        except KeyError:
                            logger.info(
                                'No taxon ID in the Genbank {0}. In the FEATURES source you must have: /db_xref="taxon:taxonid", where taxonid is the ID of your organism. You can find it on the NCBI.'
                                .format(gbk_pathname))

            elif '.gff' in input_file:
                gff_pathname = os.path.join(input_folder_path, input_file)

                # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
                try:
                    region_feature = [
                        feature for feature in DataIterator(gff_pathname)
                        if feature.featuretype == 'region'
                    ][0]
                except IndexError:
                    raise IndexError(
                        'No region feature in the GFF file of {0}, GFF file must have region features.'
                        .format(input_folder))

                try:
                    region_feature.attributes['Dbxref']
                except KeyError:
                    raise KeyError(
                        'No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                        .format(input_folder))

                for dbxref in region_feature.attributes['Dbxref']:
                    if 'taxon' in dbxref:
                        taxon_id = dbxref.split('taxon:')[1]

            elif '.pf' in input_file:
                logger.info(
                    'No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'
                    .format(input_folder))
                taxon_id = "missing"
        taxon_ids[input_folder] = taxon_id

    return taxon_ids
Example #45
0
from Bio import SeqIO
from Bio.SeqUtils import GC

# Bio.Alphabet was removed in Biopython 1.78, so no alphabet argument is
# passed to SeqIO.parse; the unused operator imports are dropped too.
high_id = None
high_gc = 0

for record in SeqIO.parse("dataset/rosalind_gc.fasta", "fasta"):
    gc = GC(record.seq)
    if gc > high_gc:
        high_id = record.id
        high_gc = gc

print(high_id)
print('{:.3f}'.format(high_gc))
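
# The same selection in one expression: max() with GC as the key (a
# sketch; GC is recomputed once for the winning record):
from Bio import SeqIO
from Bio.SeqUtils import GC

best = max(SeqIO.parse("dataset/rosalind_gc.fasta", "fasta"),
           key=lambda rec: GC(rec.seq))
print(best.id)
print('{:.3f}'.format(GC(best.seq)))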
Example #46
0
if __name__ == '__main__':
    import re
    from Bio import SeqIO
    from common import file_parser
    import os

    parser = file_parser(prog_desc='Find sequences that contain KLEEKS', file_desc='A FASTA file containing protein sequences')
    args = parser.parse_args()

    
    kleek_filter = re.compile(r'KL[EI]{2,}K')
    hits = 0
    with open(args.file, 'r') as fd:
        for record in SeqIO.parse(fd, 'fasta'):
            if kleek_filter.search(str(record.seq)):
                hits += 1
                print(record.format('fasta'))
    # print(hits)

Example #47
0
from Bio import SeqIO
import gzip

with open("tmpxcmM8P/splitted_temp.fastq", "rU") as splitted:
    original=SeqIO.parse(gzip.open("SRR027957_2.fastq.gz", "rb"), "fastq")
    for record in SeqIO.parse(splitted, "fastq"):
        orecord=original.next()
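
# zip keeps the two parsers in step and stops at the shorter stream,
# avoiding manual next() calls (a sketch, same files as above):
import gzip
from Bio import SeqIO

with open("tmpxcmM8P/splitted_temp.fastq") as split_handle, \
        gzip.open("SRR027957_2.fastq.gz", "rt") as orig_handle:
    for record, orecord in zip(SeqIO.parse(split_handle, "fastq"),
                               SeqIO.parse(orig_handle, "fastq")):
        pass  # process the paired records here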

Example #48
0
			track.add_set(fs)
			diag.add_track(track, 1)
			pglen = float(clust[k]['end'] - clust[k]['start']) / float(1000)
			diag.draw(
				format = "linear",
				orientation = "landscape",
				pagesize = (pglen*cm, 5*cm),
				fragments = 1,
				start = clust[k]['start'],
				end= clust[k]['end']
			)
			diag.write('.'.join([pref, k, 'svg']), "SVG")	

## Clustmine pipeline
if(gbt == 'clustmine'):
	for rec in SeqIO.parse(gb, "genbank"):
		clust = {}
		cur = ''
		for feat in rec.features:
			if(feat.type == "cluster"):
				cur = feat.qualifiers['name'][0]
				clust[cur] = {
					'start':	feat.location.start,
					'end':		feat.location.end,
					'gene':		{}
				}
			elif(feat.type == "gene"):
				if(feat.location.start > clust[cur]['start'] and feat.location.end < clust[cur]['end']):
					clust[cur]['gene'][ feat.qualifiers['name'][0] ] = feat
		drawsvg(clust)
    def get_hits_number_helper(self, fasta_file):
        """Parse a fasta file and return the number of sequences it contains."""
        seq_ids = [
            sequence.id for sequence in SeqIO.parse(fasta_file, 'fasta')
        ]
        return len(seq_ids)
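
# Building a list of ids just to take its length is avoidable; a
# generator counts records in constant memory (a sketch):
from Bio import SeqIO

def count_fasta_records(fasta_file):
    return sum(1 for _ in SeqIO.parse(fasta_file, "fasta"))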
Example #50
0
import sys

import pandas as pd
from Bio import SeqIO


# The function header was cut off in the snippet; reconstructed from the
# call below: linear scan of the database, -1 means "not found".
def search_sequences_into_db(database, sequence):
    index = -1
    for i in range(len(database['Sequence'])):
        if sequence == database['Sequence'][i]:
            index = i
            break

    return index


database = pd.read_csv(sys.argv[1])
fasta_LAMP = sys.argv[2]

path_output = sys.argv[3]

#read doc
sequences_data = []

for record in SeqIO.parse(fasta_LAMP, "fasta"):
    sequences_data.append(record.seq)

sequences_data = list(set(sequences_data))
sequences_not_DB = []

for sequence in sequences_data:
    print("Process sequence: ", sequence)
    index = search_sequences_into_db(database, sequence)
    if index == -1:
        sequences_not_DB.append(sequence)

#export sequences not in DB to fasta file
with open(path_output + "sequences_not_DB.fasta", 'w') as fasta_export:
    for i, sequence in enumerate(sequences_not_DB):
        # illustrative header: the original snippet opened this file but
        # never wrote records to it
        fasta_export.write(">sequence_{0}\n{1}\n".format(i, sequence))

print(len(sequences_not_DB))
Example #51
0
#!/home/incerta/Metagenomics/bin/Programs/Miniconda/envs/python2.7/bin/python
# Written for biopython under python2.7
# V0.01 Written by Gisle Vestergaard ([email protected])
from Bio import SeqIO
import argparse

parser = argparse.ArgumentParser(
    description="Split the fasta file into individual files, one per gene sequence")
parser.add_argument('-f',
                    action='store',
                    dest='fasta_file',
                    help='Input fasta file')
result = parser.parse_args()

f_open = open(result.fasta_file, "rU")

for rec in SeqIO.parse(f_open, "fasta"):
    id = rec.id
    seq = rec.seq
    id_file = open(id + ".fasta", "w")
    id_file.write(">" + str(id) + "\n" + str(seq))
    id_file.close()

f_open.close()
Example #52
0
def load_ref(ref_g):
	logging.info("Loading reference genome...")
	return SeqIO.to_dict(SeqIO.parse(ref_g, "fasta"))
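
# For reference genomes too large for an in-memory dict, SeqIO.index
# gives the same dict-like access lazily (a sketch; the path and key are
# illustrative):
from Bio import SeqIO

ref = SeqIO.index("reference.fasta", "fasta")
record = ref["chr1"]  # parsed from disk on demand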
searched_genes = [
    "SAOUHSC_01852", "SAOUHSC_01482", "SAOUHSC_00832", "SAOUHSC_01699",
    "SAOUHSC_01635", "SAOUHSC_01481", "SAOUHSC_01483"
]

input_file = "Aureus.gb"  #Your GenBank file locataion. e.g C:\\Sequences\\my_genbank.gb
output_file_name = "try.fasta"  #The name out your fasta output
accession_numbers = [
    line.strip() for line in open('Aureus.gb')
]  #the same as your input file, defines the headers for each sequence

if not os.path.exists(
        output_file_name
):  #checks for a pre-existing file with the same name as the output
    for rec in SeqIO.parse(
            input_file, "gb"
    ):  #calls the record for the genbank file and SeqIO (BioPython module) to parse it
        acc = rec.annotations['accessions'][0]  #Defines your accession numbers
        organism = rec.annotations['organism']  #defines your organism
        tax_line = ("| ").join(
            rec.annotations['taxonomy']
        )  # defines your taxonomy, separating entries with a |; remove this line, 'tax_line', and the {2} in your save for a simpler output
        for i in range(len(searched_genes)):
            for feature in rec.features:  #looks for features in the genbank
                if feature.type == 'CDS':
                    for key, val in feature.qualifiers.items(
                    ):  #looks for val in the feature qualifiers
                        if any(
                                searched_genes[i] in s for s in val
                        ):  #Finds all the CDS which contain the word "protein" in the qualifiers. Change to 'if "Name" in val:' for protein called "name" exactly
                            with open(
Example #54
0
import sys, os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

if len(sys.argv) < 4:
    print(
        "Usage: python combine_alignments.py exon.fasta intron.fasta[or any value if no intron] geneName"
    )
    sys.exit(1)

exon_fn = sys.argv[1]
intron_fn = sys.argv[2]
geneName = sys.argv[3]

exon_dict = SeqIO.to_dict(SeqIO.parse(exon_fn, 'fasta'))
exonLength = len(next(iter(exon_dict.values())))  # itervalues() is Python 2 only

if os.path.isfile(intron_fn):
    with open("{}.combined.fasta".format(geneName), 'w') as outfile:
        for seq in SeqIO.parse(intron_fn, 'fasta'):
            intronLength = len(seq)
            sampleID = seq.id.split("-")[0]
            newseq = exon_dict[sampleID].seq + seq.seq
            outfile.write(">{}\n{}\n".format(sampleID, newseq))
        partition = """
begin sets;
charset codon1 = 1-{}\\3;
charset codon2 = 2-{}\\3;
charset codon3 = 3-{}\\3;
charset intron = {}-{};
Example #55
0
def removeAntiSense(input, readTuple, output):
    '''
    function will map reads to the input transcripts, determine strandedness, and then filter
    out transcripts that were assembled in antisense orientation. idea here is that the antisense
    transcripts, while potentially valid, aren't going to help update the gene models and perhaps
    could hurt the annotation effort?
    '''
    lib.log.info("Running anti-sense filtering of Trinity transcripts")
    bamthreads = (
        args.cpus +
        2 // 2) // 2  #use half number of threads for bam compression threads
    aligner = choose_aligner()
    if aligner == 'hisat2':
        bowtie2bam = os.path.join(tmpdir, 'hisat2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Hisat2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'hisat2-build', input,
                os.path.join(tmpdir, 'hisat2.transcripts')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

            #now launch the aligner
            lib.log.info("Aligning reads to trinity transcripts with Hisat2")
            hisat2cmd = [
                'hisat2', '-p',
                str(args.cpus), '-k', '50', '--max-intronlen',
                str(args.max_intronlen), '-x',
                os.path.join(tmpdir, 'hisat2.transcripts')
            ]
            if readTuple[2]:
                hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                hisat2cmd = hisat2cmd + [
                    '-1', readTuple[0], '-2', readTuple[1]
                ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(hisat2cmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

    elif aligner == 'bowtie2':
        #using bowtie2
        bowtie2bam = os.path.join(tmpdir,
                                  'bowtie2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Bowtie2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'bowtie2-build', input,
                os.path.join(tmpdir, 'bowtie2.transcripts')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with Bowtie2")
            bowtie2cmd = [
                'bowtie2', '-p',
                str(args.cpus), '-k', '50', '--local', '--no-unal', '-x',
                os.path.join(tmpdir, 'bowtie2.transcripts')
            ]
            if readTuple[2]:
                bowtie2cmd = bowtie2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                bowtie2cmd = bowtie2cmd + [
                    '-1', readTuple[0], '-2', readTuple[1]
                ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(bowtie2cmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

    elif aligner == 'rapmap':
        #using rapmap
        bowtie2bam = os.path.join(tmpdir, 'rapmap.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building RapMap index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'rapmap', 'quasiindex', '-t', input, '-i',
                os.path.join(tmpdir, 'rapmap_index')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with RapMap")
            rapmapcmd = [
                'rapmap', 'quasimap', '-t',
                str(args.cpus), '-i',
                os.path.join(tmpdir, 'rapmap_index'), '-1', readTuple[0], '-2',
                readTuple[1]
            ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(rapmapcmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess(cmd, '.', lib.log)

    #now run Trinity examine strandeness tool
    lib.log.info("Examining strand specificity")
    cmd = [
        os.path.join(TRINITY, 'util', 'misc', 'examine_strand_specificity.pl'),
        bowtie2bam,
        os.path.join(tmpdir, 'strand_specific')
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    #parse output dat file and get list of transcripts to remove
    removeList = []
    with open(os.path.join(tmpdir, 'strand_specific.dat'), 'r') as infile:
        for line in infile:
            line = line.replace('\n', '')
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if args.stranded == 'RF':  #then we want to keep negative ratios in cols[4]
                if not cols[4].startswith('-'):
                    removeList.append(cols[0])
            elif args.stranded == 'FR':  #keep + values
                if cols[4].startswith('-'):
                    removeList.append(cols[0])

    #now parse the input fasta file removing records in list
    with open(output, 'w') as outfile:
        with open(input, 'r') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if record.id not in removeList:
                    outfile.write(">%s\n%s\n" %
                                  (record.description, str(record.seq)))
    lib.log.info("Removing %i antisense transcripts" % (len(removeList)))
Example #56
0
from Bio import SeqIO
import sys

if len(sys.argv) != 5:
    print("Usage: python simple_trim.py <input_file> <output_file> <start_position> <end_position>")

else:

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    start = int(sys.argv[3])
    end = int(sys.argv[4])

    with open(output_file, "w") as out:
        for seq_record in SeqIO.parse(input_file, "fasta"):
            trimmed_seq = str(seq_record.seq[start:end])
            # Write every record; the original re-opened the output in "w"
            # mode inside the loop, so only the last record survived.
            out.write(">" + seq_record.id + "\n")
            out.write(trimmed_seq + "\n")
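
# SeqRecord slicing keeps the record id, so the same trim can also be
# written with SeqIO.write (a sketch reusing input_file, output_file,
# start and end from the script above):
records = (rec[start:end] for rec in SeqIO.parse(input_file, "fasta"))
SeqIO.write(records, output_file, "fasta")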
def fasta_id(fastafile):
    fastaid = defaultdict(str)
    for record in SeqIO.parse(fastafile,"fasta"):
        fastaid[record.id] = 1
    return fastaid
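
# The defaultdict above only serves as a membership set; a set
# comprehension says the same thing (a sketch):
from Bio import SeqIO

def fasta_id_set(fastafile):
    return {record.id for record in SeqIO.parse(fastafile, "fasta")}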
Example #58
0
def postprocess_chains(assembly, reads_real_coords):
    rare_kmers = get_kmers(assembly.kmers_fname)
    unique_kmers = get_kmers(assembly.solid_kmers_fname)

    rare_kmers_by_pos = []
    unique_kmers_by_pos = []
    assembly_id = ""
    with open(assembly.fname) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            assembly_len = len(record.seq)
            assembly_seq = str(record.seq)
            assembly_id = str(record.id)
            '''rare_kmers_by_pos = [0] * assembly_len
            unique_kmers_by_pos = [0] * assembly_len
            for i in range(len(assembly_seq) - KMER_SIZE + 1):
                kmer = assembly_seq[i:i + KMER_SIZE]
                if kmer in rare_kmers or rev_comp(kmer) in rare_kmers:
                    rare_kmers_by_pos[i] = 1
                if kmer in unique_kmers or rev_comp(kmer) in unique_kmers:
                    unique_kmers_by_pos[i] = 1'''
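            # NOTE: with the k-mer counting block above commented out,
            # rare_kmers_by_pos stays an empty list, so the
            # MAX_MISSED_KMERS check further below can never add a
            # breakpoint.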

    read_alignments = defaultdict(list)
    read_lengths = dict()
    read_seeds = defaultdict(lambda: defaultdict(list))
    with open(assembly.chains_fname) as f:
        #Aln     -a3aa51bb-4cZ 4 23715 24082 +cenX 1894557 1918322 3053970 -1894573 1135315 140 0.27543
        for line in f:
            fs = line.split()
            if "Aln" in line and len(fs) >= 8:
                read_name, align_start, align_end, read_len, \
                ref_name, ref_start, ref_end, ref_len = fs[1], fs[2], fs[3], fs[4], fs[5], fs[6], fs[7], fs[8]
                align_start, align_end, read_len, ref_start, ref_end, ref_len = map(
                    int, (align_start, align_end, read_len, ref_start, ref_end,
                          ref_len))
                seq_div = float(fs[-1])
                read_lengths[read_name[1:]] = read_len
                if read_name.startswith('-'):
                    align_start, align_end = read_len - align_end - 1, read_len - align_start - 1
                if ref_name.startswith('-'):
                    ref_start, ref_end = ref_len - ref_end - 1, ref_len - ref_start - 1
                read_alignments[read_name[1:]].append(
                    (ref_start, ref_end, align_start, align_end, seq_div))
            elif len(fs) >= 2:
                read_pos, ref_pos = int(fs[0]), int(fs[1])
                if read_name.startswith('-'):
                    read_pos = read_len - read_pos - KMER_SIZE
                if ref_name.startswith('-'):
                    ref_pos = ref_len - ref_pos - KMER_SIZE
                read_seeds[read_name[1:]][(ref_start, ref_end, align_start,
                                           align_end, seq_div)].append(
                                               (read_pos, ref_pos))

    num_alignments = 0
    all_errors = []
    with open(assembly.bed_fname, "w") as f:
        for read_name, aligns in read_alignments.items():
            max_kmers = 0
            max_len = 0
            selected_chain = []
            selected_errors = []
            for align in aligns:
                seeds = read_seeds[read_name][align]
                seeds.sort(key=lambda x: x[1])
                best_chain = None
                best_kmers = 0
                best_len = 0
                best_errors = []
                cur_errors = []
                if len(seeds) >= MIN_CHAIN_KMERS:
                    prev_pos = 0
                    unique_seeds = []
                    for seed in seeds:
                        read_pos, ref_pos = seed
                        if ref_pos - prev_pos >= KMER_SIZE or not unique_seeds:
                            unique_seeds.append((read_pos, ref_pos))
                            prev_pos = ref_pos
                    unique_seeds.append(seeds[-1])
                    # seeds = unique_seeds
                    new_chains = []
                    breakpoints = []
                    if assembly.real_coords:
                        for i, s in enumerate(seeds):
                            seeds[i] = (
                                reads_real_coords[read_name][s[0]],
                                assembly.real_coords[assembly_id][s[1]])

                    for i in range(1, len(seeds)):
                        ref_diff = abs(seeds[i][1] - seeds[i - 1][1])
                        read_diff = abs(seeds[i][0] - seeds[i - 1][0])
                        max_diff = max(100, min(ref_diff, read_diff) * 0.05)
                        if abs(
                                ref_diff - read_diff
                        ) >= max_diff:  # and abs(ref_diff-read_diff) >= 5000:
                            prev_bp = i - 1
                            breakpoints.append(i - 1)
                            cur_errors.append(
                                (seeds[i - 1][1], seeds[i][1], read_name,
                                 ref_diff - read_diff))
                        elif seeds[i][1] - seeds[i - 1][1] >= MAX_REF_GAP:
                            gap_s, gap_e = seeds[i -
                                                 1][1] + KMER_SIZE, seeds[i][1]
                            if sum(rare_kmers_by_pos[gap_s:gap_e]
                                   ) > MAX_MISSED_KMERS:
                                breakpoints.append(i - 1)
                                prev_bp = i - 1
                    if breakpoints:
                        chain_start1, chain_end1, chain_start2, chain_end2 = seeds[
                            0][1], seeds[-1][1], seeds[0][0], seeds[-1][0]
                        start_n = 0
                        for p in breakpoints:
                            chain_end1, chain_end2 = seeds[p][1], seeds[p][0]
                            new_chains.append(
                                (chain_start1, chain_end1, chain_start2,
                                 chain_end2, p - start_n + 1))
                            if p < len(seeds):
                                chain_start1, chain_start2, start_n = seeds[
                                    p + 1][1], seeds[p + 1][0], p + 1

                        chain_end1, chain_end2 = seeds[-1][1], seeds[-1][0]
                        if chain_end1 > chain_start1:
                            new_chains.append(
                                (chain_start1, chain_end1, chain_start2,
                                 chain_end2, len(seeds) - p))
                        chains = []
                        total_kmers = 0
                        total_len = 0
                        for c in new_chains:
                            chain_start1, chain_end1, chain_start2, chain_end2, chain_kmers = c
                            if chain_kmers > MIN_CHAIN_KMERS and chain_end1 - chain_start1 >= MIN_CHAIN_LEN:
                                chains.append((chain_start1, chain_end1,
                                               chain_start2, chain_end2))
                                total_kmers += chain_kmers
                                total_len += chain_end1 - chain_start1
                        if total_kmers > best_kmers:
                            best_kmers = total_kmers
                            best_len = total_len
                            best_chain = chains
                            best_errors = cur_errors
                    else:
                        best_kmers = len(
                            seeds) if len(seeds) > MIN_CHAIN_KMERS / 2 else 0
                        best_len = seeds[-1][1] - seeds[0][1]
                        best_chain = [[
                            seeds[0][1], seeds[-1][1], seeds[0][0],
                            seeds[-1][0]
                        ]]
                        best_errors = []

                    if best_len < 100:
                        continue
                    if best_kmers > max_kmers:
                        max_kmers = best_kmers
                        max_len = best_len
                        selected_chain = best_chain
                        selected_errors = best_errors

            for c in selected_chain:
                ref_start, ref_end, align_start, align_end = c
                if (ref_end - ref_start) < MIN_CHAIN_LEN:
                    continue
                num_alignments += 1
                f.write("seq\t%d\t%d\t%s\t%d\t%d\t%d\n" %
                        (ref_start, ref_end, slugify(read_name), align_start,
                         align_end, read_lengths[read_name]))
            all_errors.extend(selected_errors)

    print("  Total %d alignments" % num_alignments)
    print("  Longest chains saved to %s" % assembly.bed_fname)
    return all_errors
		 "CO2" : 'COX2', 'ATP8': "ATP8", 'ATP6': "ATP6", 
		 "CO3" : 'COX3', 'ND3': "ND3", "ND4L" : 'ND4L', 
		 'ND4': "ND4", 'ND5': "ND5", 'ND6': "ND6", "CYB" : 'CYTB'}

position = {1 : 1, 2 : 2, 0 : 3}

# REFERENCE
# Parse genome reference and get gene refs
refGenes = {"geneName" : [],
			"startEnd" : [],
			"sequence" : []}

refGenbank = os.path.join(dirname, "../../Body/3Results/NC_012920.1.gb")
refGenbank = open(refGenbank)

for rec in SeqIO.parse(refGenbank, "genbank"):
    if rec.features:
        for feature in rec.features:
            if feature.type == "CDS":
                # Extract gene name and append to dict
                featureName = feature.qualifiers['gene'][0]
                refGenes["geneName"].append(featureName)
                # Extract gene location as range and append to dict
                featureLocation = [int(feature.location.start), int(feature.location.end)]
                refGenes["startEnd"].append(featureLocation)
                # Extract gene sequence string and append to dict
                featureSequence = str(feature.location.extract(rec).seq)
                refGenes["sequence"].append(featureSequence)

# Turn into data frame
refGenes = pd.DataFrame.from_dict(refGenes)
Example #60
0
def open_genome():
    handle = open(genome_seq, "r")
    record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
    handle.close()
    #print record_dict.keys()
    return record_dict