Example #1
    def standard_test_procedure(self, cline):
        """Standard testing procedure used by all tests."""

        # Overwrite existing files.
        cline.force = True

        # Mark output files for later cleanup.
        self.add_file_to_clean(cline.outfile)
        if cline.guidetree_out:
            self.add_file_to_clean(cline.guidetree_out)

        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
        self.assertEqual(str(eval(repr(cline))), str(cline))
        output, error = cline()
        self.assertTrue(not output or output.strip().startswith("CLUSTAL"))

        # Test if ClustalOmega executed successfully.
        self.assertTrue(error.strip() == "" or
                        error.startswith("WARNING: Sequence type is DNA.") or
                        error.startswith("WARNING: DNA alignment is still experimental."))

        # Check the output...
        align = AlignIO.read(cline.outfile, "clustal")
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
        self.assertEqual(len(set(input_records.keys())), len(set(output_records.keys())))
        for record in align:
            self.assertEqual(str(record.seq), str(output_records[record.id].seq))

        # TODO - Try and parse this with Bio.Nexus?
        if cline.guidetree_out:
            self.assertTrue(os.path.isfile(cline.guidetree_out))
Example #2
def calculate_global_identity_blastp_fasta(args):
    # import proteins as fasta
    protein_dict = SeqIO.to_dict(SeqIO.parse(args.protein_fp, "fasta"))
    proteindb_dict = SeqIO.to_dict(SeqIO.parse(args.protdb_fp, "fasta"))

    final = open(args.output_fp + "/final_protein_identity.txt", 'w')
    for result in open(args.output_fp + "/blastp_results.txt", 'r'):
        id_number1 = result.split()[0]
        print(id_number1)
        id_number2 = result.split()[1]
        print(id_number2)


        SeqIO.write(protein_dict[id_number1], args.output_fp + "/temp_fasta1.fa", "fasta")
        SeqIO.write(proteindb_dict[id_number2], args.output_fp + "/temp_fasta2.fa", "fasta")
        
        needle = "needle -outfile=" + args.output_fp + "/temp_identity.txt -asequence " + args.output_fp + "/temp_fasta1.fa -bsequence " + args.output_fp + "/temp_fasta2.fa -gapope\
n=10 -gapextend=0.5"
        h.run_command(needle)

        identity = "NA"
        for line in open(args.output_fp + "/temp_identity.txt", 'r'):
            if line.startswith("# Identity:"):
                identity = line.split("(")[1].rstrip("\n").rstrip(")").rstrip("%")

        final.write(id_number1 + "\t" + retrieve_annotations(id_number1, args.anno_tab) + "\t"+ id_number2 + "\t" + identity + "\n")
        
    h.run_command("rm " + args.output_fp +  "/temp_*")
Example #3
def readFASTA_SeqIO(x):
    """
    If the input is a sequence file (or a list of files), load it from disk; files should be in FASTA format.
    Uses SeqIO. Dicts of SeqRecords and single SeqRecords are passed through unchanged.
    """

    o = []
    if type(x) is list:
        for idx, i in enumerate(x):
            if os.path.isfile(i):
                with open(i, "r") as f:
                    d = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
                    o.append(d)
            else:
                o.append(i)  # append the non-file item itself, not the whole input list
    elif os.path.isfile(str(x)):
        with open(x, "r") as f:
            o = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
    elif isinstance(x,dict):
        #proper offset
        if isinstance([i for i in x.values()][0], SeqIO.SeqRecord):
            o = x
        else:
            raise TypeError("input must be type filename or SeqIO.SeqRecord)")
    elif  isinstance(x, SeqIO.SeqRecord):
        o = x
    else:
        raise TypeError("input must be type filename or SeqIO.SeqRecord)")
    return o
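A brief usage sketch (file names hypothetical): the helper returns a dict keyed by record id for a single FASTA path, a list of such dicts for a list of paths, and passes dicts of SeqRecords or single SeqRecords through unchanged:

seqs = readFASTA_SeqIO("proteins.fasta")           # dict of SeqRecords
batches = readFASTA_SeqIO(["a.fasta", "b.fasta"])  # list of dicts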
Example #4
def genebank_fix_n_read(gb_fname, key_func_type="gi"):
    """locations formatted as  '1^593' cause BioPython error while reading genbanks ...
    We are addressing that by fixing genbank source on the fly ..."""
    print "Reading %s with genebank records from the NCBI fetch ..." % gb_fname
    # choose the key-function based on the 'key_func_type' argument:
    if key_func_type == "gi":
        key_function = lambda rec: rec.annotations["gi"]
    if key_func_type == "id":
        key_function = lambda rec: rec.id
    print "Using %s as a key." % key_func_type
    #
    with __warnings.catch_warnings():
        # e.g. BiopythonParserWarning: Dropping bond qualifier in feature location
        #
        __warnings.simplefilter("ignore", __BiopythonParserWarning)
        #
        gb_recs_iter = __SeqIO.parse(gb_fname, "gb")
        try:
            gbrecs = __SeqIO.to_dict(gb_recs_iter, key_function=key_function)
        except ValueError, er_msg:
            print "Catched ValueError: %s" % str(er_msg)
            # #
            # Invalid between location '1^593'
            # Try to fix that thing, by replacing '^' with '..'
            file_string_io = __fix_genbank(er_msg, gb_fname)
            gb_recs_iter = __SeqIO.parse(file_string_io, "gb")
            gbrecs = __SeqIO.to_dict(gb_recs_iter, key_function=key_function)
    return gbrecs
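The __fix_genbank helper called above is not shown here; a minimal sketch of what it might do, under the assumption that it only needs to replace the offending '^' in the reported location with '..' and return an in-memory handle:

from io import StringIO

def __fix_genbank(er_msg, gb_fname):
    # er_msg looks like "Invalid between location '1^593'"; extract the bad token.
    bad_location = str(er_msg).split("'")[1]          # e.g. "1^593"
    fixed_location = bad_location.replace("^", "..")  # e.g. "1..593"
    with open(gb_fname) as handle:
        content = handle.read()
    return StringIO(content.replace(bad_location, fixed_location))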
Example #5
    def read_seqs(self, sequence_file):
        """
        Read sequences from UniProt files (.dat or .fasta), or from lists or dicts of
        Biopython SeqRecords, and make them available for fast search. Repeated calls
        with this function append to the existing collection.

        :param sequence_file: uniprot files (.dat or .fasta)
        :return:
        """
        recs = sequence_file
        if not isinstance(sequence_file, dict) and not isinstance(sequence_file, list):
            try:
                with open(sequence_file, 'rb') as f:
                    if sequence_file.endswith('.fa') or sequence_file.endswith('.fasta'):
                        recs = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
                    else:  # assume it is a SwissProt .dat file
                        recs = SeqIO.to_dict(SeqIO.parse(f, 'swiss'))
            except Exception:
                warnings.warn("Could not read file", UserWarning)
                return
        if isinstance(sequence_file, list):
            recs = SeqIO.to_dict(sequence_file)
        if recs:
            self.collection.update(recs)
            self.searchstring = '#'.join([str(x.seq) for x in self.collection.values()]).decode('ascii')
            self.accs = self.collection.keys()
            self.idx = list()
            self.idx.append(0)
            for i, v in enumerate(self.collection.values()):
                self.idx.append(1 + self.idx[-1] + len(self.collection.values()[i].seq))
        return
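A hedged usage sketch (the class holding read_seqs is not shown; assume an instance with an empty collection dict). Because repeated calls append, several files can be merged into one searchable collection:

searcher.read_seqs("human.fasta")  # load FASTA records
searcher.read_seqs("extra.dat")    # append SwissProt records to the same collection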
Example #6
def aa2nt_aln(aa_aln,nt_fasta,outfile):
	# store the nucleotide sequences in a dictionary
	nt_file = open(nt_fasta,"r")
	nt_dict = SeqIO.to_dict(SeqIO.parse(nt_file, "fasta"))
	nt_file.close()
	# store the amino acid sequences in a dictionary
	aa_file = open(aa_aln,"r")
	aa_dict = SeqIO.to_dict(SeqIO.parse(aa_file, "fasta"))
	aa_file.close()

	# read through an aa sequence one site at a time
	# if the site is not a gap insert the corresponding codon into a new nt sequence
	# if it is a gap, insert three gap characters
	seq_name=1
	for seq in aa_dict:
		new_seq=""
		counter=0
		for character in aa_dict[seq]:
			if character != '-':
				if character != '*':
					new_seq = new_seq+nt_dict[seq].seq[counter:counter+3]
					counter = counter+3
				else:
					new_seq = new_seq+"---"
					counter = counter+3					
			else:
				new_seq = new_seq+"---"
		print >>outfile, ">seq"+str(seq_name)
		print >>outfile, new_seq
		seq_name=seq_name+1
	outfile.close()
Example #7
def create_fastas(componentDict, plasmidFile):
    """Create new nucleotide and amino acid fasta files that are separate
    for each component from the assembly with ESX genes"""
    with open(plasmidFile) as infile:
        for line in infile:
            line = line.strip().split()
            strain = line[0]
            aa = 'newAnnotations/{0}_plasmid/{0}_plasmid.faa'.format(strain)
            nuc = 'newAnnotations/{0}_plasmid/{0}_plasmid.ffn'.format(strain)
            gff = 'newAnnotations/{0}_plasmid/{0}_plasmid.gff'.format(strain)
            aa_dict = SeqIO.to_dict(SeqIO.parse(aa, "fasta"))
            nuc_dict = SeqIO.to_dict(SeqIO.parse(nuc, "fasta"))
            for c in componentDict[strain]:
                new_aa = []
                new_nuc = []
                with open(gff, 'r') as gff_file:
                    for line in gff_file:
                        if line[0] != '#':
                            if line[0] == '>':
                                break
                            line = line.strip().split()
                            component = line[0][-2]
                            gene = line[8].split(';')[0][3:]
                            if component == c:
                                try:
                                    new_aa.append(aa_dict[gene])
                                    new_nuc.append(nuc_dict[gene])
                                except KeyError:
                                    print("{0} does not exist".format(gene))
                SeqIO.write(new_aa, "{0}_{1}.faa".format(strain, c), "fasta")
                SeqIO.write(new_nuc, "{0}_{1}.ffn".format(strain, c), "fasta")
Example #8
 def test_seqfile_source ( self ) :
     """
     Test BioSeqs.from_seqfile() and BioSeqs.write() methods.
     """
     infile = 'Fasta/f001.fasta'
     self.assertTrue(os.path.isfile(infile))
     seq_db = BioSeqs.from_seqfile(infile, 'fasta')
     outfile = 'tmp_test.gb'
     outrepfile = 'tmp_test.rep'
     self.files_to_clean.add(outfile)
     self.files_to_clean.add(outrepfile)
     seq_db.write(outfile)
     self.assertTrue(os.path.isfile(outfile))
     # Check the content of both sequence files
     indict = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))
     outdict = SeqIO.to_dict(SeqIO.parse(outfile, 'gb'))
     self.assertEqual(len(indict), len(outdict))
     for key, value in viewitems(indict) :
         self.assertEqual(str(value.seq), str(outdict[key].seq))
     # Check the content of the report file
     with open(outrepfile, 'r') as repfile :
         for line in repfile.readlines() :
             self.assertTrue(('Num. sequences: 50' in line) or
                 ('History:' in line) or
                 (bool(re.match(r"""\d\d\d\d/\d\d/\d\d\ \d\d:\d\d:\d\d[ ]+
                                    local[ ]+.*Tests/Fasta/f001\.fasta
                                    [ ]+fasta""", line, re.VERBOSE))))
Example #9
def runAlignments(myFileProt, outDir):
    # assumes that nucleotide and amino acid files have the same name scheme except for extension
    # faa versus fna
    
    baseName = os.path.splitext(os.path.basename(myFileProt))[0]

    myFileDna = os.path.join(os.path.dirname(myFileProt), baseName+".fna")
    protAliOutFile = os.path.join(outDir, baseName+".aln")
    pal2nalOutFile = os.path.join(outDir, baseName+".pal2nal")
    
    sequences = SeqIO.to_dict(SeqIO.parse(myFileProt, 'fasta'))
    if len(sequences) == 1:
        # nothing to align, just write to files and continue to next file
        SeqIO.write(sequences.values(), open( protAliOutFile, 'w'), 'clustal')
        sequences = SeqIO.to_dict(SeqIO.parse(myFileDna, 'fasta'))
        SeqIO.write(sequences.values(), open(pal2nalOutFile, 'w'), 'clustal')
        print "Single seq file, exiting"
        return
    clustalWCmd = ['/Users/mahdi/programs/clustalw-2.1-macosx/clustalw2', '-ALIGN',
                   '-INFILE=%s' % myFileProt, '-TYPE=PROTEIN', '-OUTFILE=%s' % protAliOutFile, '-OUTORDER=INPUT']
    # mafftCmd = ['mafft-linsi', '--amino', '--clustalout', myFileProt , '>', protAliOutFile]


    pal2nalcmd = ['perl', '/Users/mahdi/programs/pal2nal.v14/pal2nal.pl', protAliOutFile, myFileDna, '-codontable', '9',  ]
    with open(os.devnull, 'w') as fnull, open(pal2nalOutFile, 'w') as out:
        subprocess.call(clustalWCmd, stderr=fnull, stdout=fnull)
        subprocess.call(pal2nalcmd, stdout=out)
    print "completed file %s" % baseName
Example #10
def concat_seqs(trimmedDir, outputFile):
	"""
	Pull out all the trimmed sequences associated with one genome and concatenate into one sequence.
	Then put all the sequences together in one file. 
	"""
	alignedFiles = glob.glob(trimmedDir+'/*.afa')

	#Use the first one to get the list of strains
	strainNames = [x.split('|')[0] for x in SeqIO.to_dict(SeqIO.parse(open(alignedFiles[0], 'r'), 'fasta')).keys()]

	#Create a dictionary of empty strings to contain the concatenated sequences
	concatSeqs = {k:'' for k in strainNames}


	for fileName in alignedFiles:
		with open(fileName,'r') as FID:
			seqs = SeqIO.to_dict(SeqIO.parse(FID, 'fasta'))
			for strainName, record in seqs.items():
				concatSeqs[strainName.split('|')[0]] += str(record.seq)

	with open(outputFile, 'w') as FID:
		for strain in strainNames:
			FID.write('>{0}\n'.format(strain))
			FID.write(concatSeqs[strain])
			FID.write('\n')
Example #11
 def test_join ( self ) :
     """
     Test BioSeqs.join() method.
     """
     infile1 = 'Fasta/f001.fasta'
     infile2 = 'Phylip/f003.phylip'
     self.assertTrue(os.path.isfile(infile1))
     self.assertTrue(os.path.isfile(infile2))
     seq_db = BioSeqs.from_seqfile(infile1, 'fasta')
     extra_db = BioSeqs.from_seqfile(infile2, 'phylip')
     seq_db.join(extra_db)
     # Check the sequence data
     indict1 = SeqIO.to_dict(SeqIO.parse(infile1, 'fasta'))
     indict2 = SeqIO.to_dict(SeqIO.parse(infile2, 'phylip'))
     self.assertEqual(len(indict1) + len(indict2), len(seq_db))
     for key, value in viewitems(indict1) :
         self.assertEqual(str(value.seq), str(seq_db.data[key].seq))
     for key, value in viewitems(indict2) :
         self.assertEqual(str(value.seq), str(seq_db.data[key].seq))
     # Check the report information
     self.assertIn('local', seq_db._report[0][1])
     self.assertIn('Tests/Fasta/f001.fasta', seq_db._report[0][2])
     self.assertIn('fasta', seq_db._report[0][3])
     self.assertIn('local', seq_db._report[1][1])
     self.assertIn('Tests/Phylip/f003.phylip', seq_db._report[1][2])
     self.assertIn('phylip', seq_db._report[1][3])
Example #12
    def standard_test_procedure(self, cline):
        """Standard testing procedure used by all tests."""
        self.assertTrue(str(eval(repr(cline))) == str(cline))
        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"),
                                      lambda rec : rec.id.replace(":", "_"))

        #Determine name of tree file
        if cline.newtree:
            tree_file = cline.newtree
        else:
            #Clustalw will name it based on the input file
            tree_file = os.path.splitext(cline.infile)[0] + ".dnd"

        # Mark generated files for later removal
        self.add_file_to_clean(cline.outfile)
        self.add_file_to_clean(tree_file)

        output, error = cline()
        self.assertTrue(output.strip().startswith("CLUSTAL"))
        self.assertTrue(error.strip() == "")

        #Check the output...
        align = AlignIO.read(cline.outfile, "clustal")
        #The length of the alignment will depend on the version of clustalw
        #(clustalw 2.1 and clustalw 1.83 are certainly different).
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile,"clustal"))
        self.assertTrue(set(input_records.keys()) == set(output_records.keys()))
        for record in align:
            self.assertTrue(str(record.seq) == str(output_records[record.id].seq))
            self.assertTrue(str(record.seq).replace("-", "") ==
                            str(input_records[record.id].seq))

        #Check the DND file was created.
        #TODO - Try and parse this with Bio.Nexus?
        self.assertTrue(os.path.isfile(tree_file))
Example #13
def main():
   d = raw_input("DNA filename: ")
   p = raw_input("matching Protein filename: ")
   a_seq = SeqIO.to_dict(SeqIO.parse(p,'fasta'))
   d_seq = SeqIO.to_dict(SeqIO.parse(d,'fasta'))
   t = "\t".join(["Seqid","Cohort","NT_len","AA_len","NT_N","NT_R","NT_Y","NT_W","NT_S","NT_M","NT_K","NT_H","NT_B","NT_V","NT_D","AA_X","AA_B","AA_Z"])+"\n"
   o = open(d+".freq","w")
   o.write(t)
   for sid in a_seq.keys():
      a = sid.split('|')
      NT_len = str(len(d_seq[sid].seq))
      AA_len = str(len(a_seq[sid].seq))
      NT_N = str(d_seq[sid].seq.count('N'))
      NT_R = str(d_seq[sid].seq.count('R'))
      NT_Y = str(d_seq[sid].seq.count('Y'))
      NT_W = str(d_seq[sid].seq.count('W'))
      NT_S = str(d_seq[sid].seq.count('S'))
      NT_M = str(d_seq[sid].seq.count('M'))
      NT_K = str(d_seq[sid].seq.count('K'))
      NT_H = str(d_seq[sid].seq.count('H'))
      NT_B = str(d_seq[sid].seq.count('B'))
      NT_V = str(d_seq[sid].seq.count('V'))
      NT_D = str(d_seq[sid].seq.count('D'))
      AA_X = str(a_seq[sid].seq.count('X'))
      AA_B = str(a_seq[sid].seq.count('B'))
      AA_Z = str(a_seq[sid].seq.count('Z'))
      o.write("\t".join([a[0],a[1],NT_len,AA_len,NT_N,NT_R,NT_Y,NT_W,NT_S,NT_M,NT_K,NT_H,NT_B,NT_V,NT_D,AA_X,AA_B,AA_Z])+"\n")
   o.close()
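The block of per-code count() calls above is repetitive; a sketch of the same tallies driven by two code strings (hypothetical helper, column order matching the header line written earlier):

NT_CODES = "NRYWSMKHBVD"
AA_CODES = "XBZ"

def ambiguity_counts(nt_seq, aa_seq):
    # Lengths first, then one count per IUPAC ambiguity code, all as strings.
    counts = [str(len(nt_seq)), str(len(aa_seq))]
    counts += [str(nt_seq.count(c)) for c in NT_CODES]
    counts += [str(aa_seq.count(c)) for c in AA_CODES]
    return counts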
Example #14
    def standard_test ( self, informat, outformat, params ) :
        """
        Standard testing procedure used by all tests.

        Arguments :
            informat  ( string )
                Input file format.
            outformat  ( string )
                Output file format.
            params  ( string )
                Arguments passed to the alignment tool.
        """
        infile = '{}/f001.{}'.format(informat.capitalize(), informat)
        outfile = 'tmp_test.aln'
        self.add_file_to_clean(outfile)
        # Check the input
        self.assertTrue(os.path.isfile(infile))
        self.assertEqual(len(list(SeqIO.parse(infile, informat))), 50)
        # Generate the alignment
        Align.get_alignment(muscle_exe, infile, informat, args=params,
                            outfile=outfile, outfile_format=outformat)
        # Check the output
        self.assertTrue(os.path.isfile(outfile))
        out_align = SeqIO.to_dict(SeqIO.parse(outfile, outformat))
        prevfile = '{}/f001.muscle_{}.aln'.format(outformat.capitalize(),
                                                  params)
        self.assertTrue(os.path.isfile(prevfile))
        prev_align = SeqIO.to_dict(SeqIO.parse(prevfile, outformat))
        self.assertEqual(len(viewkeys(out_align)), len(viewkeys(prev_align)))
        for key, value in viewitems(out_align) :
            self.assertEqual(str(value.seq), str(prev_align[key].seq))
Example #15
def init_pair(pair):
    """Return the genomes SeqIO'ed as a dict from the bases and aa."""

    genomes = {"base": [],
               "aa": []}

    for specie in pair:
        aa_genome = os.path.join(c.BASE_PATH, c.OUTPUT, specie)
        base_genome = os.path.join(c.BASE_PATH, c.GENOMES, specie)

        if not os.path.isfile(base_genome):
            raise IOError("File {0} doesn't exist".format(specie))

        # Translate each genome only if the translation doesn't exist yet
        if not os.path.isfile(aa_genome):
            translate_fasta(base_genome,
                            output_path=os.path.dirname(aa_genome))
        # If we are here without error, append each genome to genomes
        genomes["base"].append(
            SeqIO.to_dict(SeqIO.parse(base_genome, "fasta")))
        genomes["aa"].append(
            SeqIO.to_dict(SeqIO.parse(aa_genome, "fasta")))

        genomes["base"] = clean_dict(genomes["base"])
        genomes["aa"] = clean_dict(genomes["aa"])

    return genomes
Example #16
def main(infile, gff, outfile, ftype='CDS', use_phase=False, translate=False):
    ref_seq = SeqIO.to_dict(SeqIO.parse(infile, format="fasta"))
    # Parse GFF annotations.

    genome_with_features = GFF.parse(
        gff,
        base_dict=ref_seq
        )
    """ bcbio-gff codes exons, mRNA etc as subfeatures which is now
    depreciated in biopython, this code fixes that issue. """
    new_genome_with_features = list()
    for scaffold in genome_with_features:
        new_features = list()
        for feature in scaffold.features:
            gene_features = subfeatures(feature)
            new_features.extend(gene_features)
        scaffold.features = new_features
        new_genome_with_features.append(scaffold)
    """ Genome with features doesn't have scaffolds without any gff
    features. Here I update the existing records in genome with the
    new ones containing features. """
    ref_seq.update(SeqIO.to_dict(new_genome_with_features))

    sequences = list()
    for scaffold, sequence in ref_seq.items():
        for feature in sequence.features:
            if feature.type != ftype:
                continue
            start = feature.location.start
            end = feature.location.end
            try:
                phase = int(feature.qualifiers['phase'][0])
            except KeyError:
                phase = 0
            strand = feature.location.strand

            if use_phase:
                fseq = feature.extract(sequence)[phase:]
            else:
                fseq = feature.extract(sequence)

            fseq.id = feature.id
            fseq.name = feature.id

            strand = '-' if strand == -1 else '+'
            fseq.description = "{}:{}-{}[{}]".format(
                scaffold,
                start,
                end,
                strand,
                )
            if translate:
                tseq = fseq.seq.translate()
                fseq.seq = tseq
            sequences.append(fseq)

    SeqIO.write(sequences, outfile, 'fasta')
    return
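The subfeatures helper used above is not shown; a minimal sketch, assuming its job is to flatten the deprecated sub_features hierarchy into a flat list of features:

def subfeatures(feature):
    # Recursively collect a feature and its (deprecated) sub_features.
    features = [feature]
    for sub in getattr(feature, "sub_features", []):
        features.extend(subfeatures(sub))
    return features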
Example #17
def main():
	parser = OptionParser()
	parser.add_option("-i", "--input", action="store", dest="input", help="input file to make phylotree")
	parser.add_option("-g", "--germline", action="store", dest="germline", help="germline fasta")
	parser.add_option("-o", "--output" , action="store", dest="output", help="the file where you want all your data")
	(options,args) = parser.parse_args()
	if len(sys.argv) < 2:
		dowhat()
		parser.print_help()
		exit()
	
	open(options.output, 'w').write("Your Sequence Results:\n\n")
	copy(options.input, "workable.fasta")
	copy(options.germline, "germ.fasta")
	

	

	list_of_database_files = SeqIO.to_dict(SeqIO.parse("workable.fasta", "fasta"))

        
	while list_of_database_files:
		list_of_database_files = SeqIO.to_dict(SeqIO.parse("workable.fasta", "fasta"))
		populate_database("workable.fasta")
		print "***DatabasePopulated***"
		
		newsequence_search = open("germ.fasta" , "r")	
		cline = NcbiblastpCommandline(matrix="PAM30", evalue="20", word_size="2", query="germ.fasta", cmd='blastp', db="temporary_database", out="blastout")	
		newsequence_search.close()
		
		print "****Cline = *** --->", cline

		call_blast(cline)
        	print "***Call_blast_successful***"
		
		result_handle = open('blastout')
		print "***result handle successful***" 
		
		blast_parser = NCBIStandalone.BlastParser()
		print "***blast_parser****"
		
		blast_record = blast_parser.parse(result_handle)
       		print "***blast_record***"
		
		newsequence_search = open("germ.fasta", 'w')
		newsequence_search.write(">" + str(blast_record.alignments[0].title[2:]) + "\n"  + str(blast_record.alignments[0].hsps[0].sbjct))		
	
		current_object = blast_record.alignments[0].title[2:]
		print current_object
		
		newfile = open(options.output, 'a')
		newfile.write(str(blast_record.alignments[0].hsps[0].query[:]) + "----> Query\n")
		newfile.write(str(blast_record.alignments[0].hsps[0].match[:]) + "----> Score of: " + str(blast_record.alignments[0].hsps[0].score) + "\n")
		newfile.write(str(blast_record.alignments[0].hsps[0].sbjct[:]) + "----> Template\n\n")
	
		list_of_database_files.pop(current_object)
		SeqIO.write(list_of_database_files.values(), "workable.fasta", "fasta")		
Example #18
def fasta_dict(nast_file, inf_file):
  '''Make dictionaries of sequence objects.

  I'll assume they are untouched by Jalview, but sanity code should live
  here, before loading the files as dicts.
  '''
  nast_d= SeqIO.to_dict(SeqIO.parse(open(nast_file, 'rU'), 'fasta'))
  inf_d= SeqIO.to_dict(SeqIO.parse(open(inf_file, 'rU'), 'fasta'))

  return nast_d, inf_d
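The docstring above asks for sanity code; a sketch of what such a check might look like (hypothetical helper, not part of the original):

import os
from Bio import SeqIO

def check_fasta(path):
    # Fail fast if the file is missing or contains no FASTA records.
    if not os.path.isfile(path):
        raise IOError("missing input file: %s" % path)
    if next(SeqIO.parse(path, 'fasta'), None) is None:
        raise ValueError("no FASTA records found in %s" % path)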
Example #19
    def read_seqs(self, sequence_file):
        """
        Read sequences from Ensembl protein files (.fasta), or from lists or dicts of
        Biopython SeqRecords, and make them available for fast search. Repeated calls
        with this function append to the existing collection.
        :param sequence_file: Ensembl files (.dat or .fasta)
        :return:
        """
        recs = sequence_file
        if not isinstance(sequence_file, dict) and not isinstance(sequence_file, list):
            try:
                with open(sequence_file, 'rb') as f:
                    if sequence_file.endswith('.fa') or sequence_file.endswith('.fasta'):
                        recs = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
                    else:  # assume it is a SwissProt .dat file
                        recs = SeqIO.to_dict(SeqIO.parse(f, 'swiss'))
            except Exception:
                warnings.warn("Could not read file", UserWarning)
                return
        if isinstance(sequence_file, list):
            recs = SeqIO.to_dict(sequence_file)
        if recs:
            self.collection.update(recs)
            self.searchstring = '#'.join([str(x.seq) for x in self.collection.values()]).decode('ascii')
            self.accs = self.collection.keys()
            self.idx = list()
            self.idx.append(0)
            for i, v in enumerate(self.collection.values()):
                self.idx.append(1 + self.idx[-1] + len(self.collection.values()[i].seq))

            for i in recs.items():
                ensg = None
                enst = None
                ensp = i[0]
                ks = i[1].description.split(' ')
                for j in ks:
                    # str.strip() removes a character set, not a prefix, so slice instead
                    if j.startswith('transcript:'):
                        enst = j[len('transcript:'):]
                    elif j.startswith('gene:'):
                        ensg = j[len('gene:'):]
                if ensg and enst and ensp:
                    if ensg not in self.ensg2enst:
                        self.ensg2enst[ensg] = list()
                        self.ensg2ensp[ensg] = list()
                    self.ensg2enst[ensg].append(enst)
                    self.ensg2ensp[ensg].append(ensp)
                    self.enst2ensg[enst] = ensg
                    self.enst2ensp[enst] = ensp
                    self.ensp2ensg[ensp] = ensg
                    self.ensp2enst[ensp] = enst
                else:
                    warnings.warn("Unparsable filecontents", UserWarning)
        return
Example #20
def readNexFile(fileName):
	seq = {}
	if fileName != '':
		# read data set file, requires biopython
		handle = open(fileName, 'r')
		if fileName.endswith('nex') or fileName.endswith('nexus'):
			seq = SeqIO.to_dict(SeqIO.parse(handle, 'nexus'))
		elif fileName.endswith('phy') or fileName.endswith('phylip'):
			seq = SeqIO.to_dict(SeqIO.parse(handle, 'phylip'))
		elif fileName.endswith('fasta'):
			seq = SeqIO.to_dict(SeqIO.parse(handle, 'fasta'))
		handle.close()
	return seq		
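The extension checks above could also be table-driven; a hypothetical rewrite with the same extension-to-format mapping:

from Bio import SeqIO

EXT_TO_FORMAT = {'nex': 'nexus', 'nexus': 'nexus',
                 'phy': 'phylip', 'phylip': 'phylip',
                 'fasta': 'fasta'}

def readSeqFile(fileName):
    fmt = EXT_TO_FORMAT.get(fileName.rsplit('.', 1)[-1])
    if not fmt:
        return {}
    with open(fileName, 'r') as handle:
        return SeqIO.to_dict(SeqIO.parse(handle, fmt))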
Example #21
    def t_gff3_to_gff3(self):
        """Read in and write out GFF3 without any loss of information.
        """
        recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
        out_handle = StringIO.StringIO()
        GFF.write(recs.values(), out_handle)
        wrote_handle = StringIO.StringIO(out_handle.getvalue())
        recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))

        orig_rec = recs.values()[0]
        re_rec = recs_two.values()[0]
        assert len(orig_rec.features) == len(re_rec.features)
        for i, orig_f in enumerate(orig_rec.features):
            assert str(orig_f) == str(re_rec.features[i])
Example #22
def createNucleotideCluster(config):
    '''Grab Nucleotide-Sequences of ORFs and turn into unaligned clusters'''
    logging.info("Creating nucleotide clusters")
    print "Creating nucleotide-clusters"
    sequences = {}
    snp_sequences = {}
    for organism_config in config["INPUT"]:
        organism = config["INPUT"][organism_config]
        handle = open(config["OUTPUT"]["folder"]+"orfs/"+organism["prefix"]+"-nucleotide-orfs.fasta","r")
        sequences[organism["prefix"]] = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
        handle.close()
        if config["SNP"]["call_snps"] == "True":
            handle = open(config["OUTPUT"]["folder"]+"orfs/"+organism["prefix"]+"-nucleotide-orfs.fasta","r")
            snp_sequences[organism["prefix"]] = SeqIO.to_dict(SeqIO.parse(handle,"fasta"))
            handle.close()

    cluster_handle = open(config["OUTPUT"]["folder"]+"cluster/paralog-free-clusters.csv","r")
    clusters =  {}
    for line in cluster_handle:
        seq_names = {}
        line_array = line.strip().split("\t")
        cluster_name = line_array[0][:line_array[0].find("(")]
        name_array = line_array[1].split(" ")
        for name in name_array:
            prefix = name[:name.find("|")]
            cluster_id = name[name.find("|")+1:name.find("(")] 
            seq_names[prefix] = cluster_id
        clusters[cluster_name] = seq_names           

    if os.path.isdir(config["OUTPUT"]["folder"] + "cluster/nucleotide_clusters/"):
        request = "rm " + config["OUTPUT"]["folder"] + "cluster/nucleotide_clusters/*"
        subprocess.call(request, shell=True)
    else:
        os.makedirs(config["OUTPUT"]["folder"]+"cluster/nucleotide_clusters/") 

    for name,cids in clusters.items():
        out_file = open(config["OUTPUT"]["folder"] + "cluster/nucleotide_clusters/" + name + ".fasta","w")
        if config["SNP"]["call_snps"] == "True":
            snp_out_file = open(config["OUTPUT"]["folder"] + "cluster/nucleotide_clusters/" + name + "-snps.fasta","w")
        for prefix,seqid in cids.items():
            out_file.write(">"+prefix+"\n")
            out_file.write(str(sequences[prefix][seqid].seq)+"\n")
            if config["SNP"]["call_snps"] == "True":
                snp_out_file.write(">"+prefix+"\n")
                snp_out_file.write(str(sequences[prefix][seqid].seq)+"\n")
        out_file.close() 
        if config["SNP"]["call_snps"] == "True":
            snp_out_file.close()
    logging.info("Created nucleotide-clusters")
    print "Created nucleotide-clusters"
Example #23
 def mdrMapping(self, mdrDir = '/w/simulation_fr_the_beginning/reAssemble/everybodyelse/out/pAss11'):
     '''
     Some error from the alignment process
     '''
     file = "{}/{}.fna".format(mdrDir, self.koi)
     contigs = SeqIO.to_dict(SeqIO.parse(self.query, "fasta"))
     mdrs = SeqIO.to_dict(SeqIO.parse(file, "fasta"))
     #this is already in the description
     mdrInContig = []
     alignment = {}
     for contigID in mdrs:
         contig = str(contigs[contigID].seq).upper()
         mdr = str(mdrs[contigID].seq).upper()
         revcom = str(mdrs[contigID].seq.reverse_complement()).upper()
         #damn sian, i'm not getting paid
         norm = re.search("\(ntRev\)", mdrs[contigID].description)
         if norm != None:
             seq = mdr
             sign = '+'
             logging.debug("norm {}".format(contigID))
         else:
             seq = revcom
             sign = '-'
             logging.debug("revcom {}".format(contigID))
         if seq in contig:
             start = contig.find(seq)
             nGapsS = 0
             nGapsQ = 0
         else:
             logging.info("Complete alignment fail : {}".format(contigID))
             alignments = [a for a in pw.align.globalms(contig, seq,  5, -4, -10, -1)]
             alignment[contigID] = alignments[0]
             start = abs(len(re.sub("^-+", "", alignments[0][1])) - len(alignments[0][1]))
             #Tracer('Linux')()
             nGapsS = re.sub("-+$", "", re.sub("^-+", "", alignments[0][0])).count("-")
             nGapsQ = re.sub("-+$", "", re.sub("^-+", "", alignments[0][1])).count("-")
         df = pd.DataFrame({
             'contig': contigID,
             'start': start,
             'end' : start + len(seq),
             'length': len(seq),
             'strand': sign,
             'sGaps': nGapsS,
             'qGaps': nGapsQ,
         }, index=[1])
         mdrInContig.append(df)
     df = pd.concat(mdrInContig, ignore_index=True)
     df['ko'] = self.koi
     return {'df':df, 'alignments': alignment}
Example #24
 def t_fasta_directive(self):
     """Parse FASTA sequence information contained in a GFF3 file.
     """
     recs = SeqIO.to_dict(GFF.parse(self._gff_file))
     assert len(recs) == 1
     test_rec = recs['chr17']
     assert str(test_rec.seq) == "GATTACAGATTACA"
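For context, the ##FASTA directive this test exercises lets a GFF3 file embed its sequences at the end; a minimal sketch of such an input, consistent with the assertions above:

##gff-version 3
##FASTA
>chr17
GATTACAGATTACA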
Example #25
    def not_t_full_celegans(self):
        """Test the full C elegans chromosome and GFF files.

        This is used to test GFF on large files and is not run as a standard
        test. You will need to download the files and adjust the paths
        to run this.
        """
        # read the sequence information
        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
        seq_handle = open(seq_file)
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        seq_handle.close()
        #with open(gff_file) as gff_handle:
        #    possible_limits = feature_adder.available_limits(gff_handle)
        #    pprint.pprint(possible_limits)
        rnai_types = [('Orfeome', 'PCR_product'),
                    ('GenePair_STS', 'PCR_product'),
                    ('Promoterome', 'PCR_product')]
        gene_types = [('Non_coding_transcript', 'gene'),
                      ('Coding_transcript', 'gene'),
                      ('Coding_transcript', 'mRNA'),
                      ('Coding_transcript', 'CDS')]
        limit_info = dict(gff_source_type = rnai_types + gene_types)
        for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
            pass
Example #26
def gather_est2genome_seqs(refseq_obj, est2genome_handle, log_line, velvet_file):
	seq_dir = log_line.split("\t")[1]
	tmp_refseq = seq_dir.split("/")[3].replace(".","%2E")#hardcoded in this position
	gff_file = refseq_obj.id + ".velvet_contigs.maker.output/" + seq_dir + "/" + tmp_refseq + ".gff"
	gff_handle = open(gff_file,'r')
	for gff_line in gff_handle:
		if(re.search("est2gneome",gff_line) and \
		re.search("\texpressed_sequence_match\t",gff_line)):
			curr_start = int(gff_line.split("\t")[3])
			curr_stop = int(gff_line.split("\t")[4])
			curr_strand = gff_line.split("\t")[6]
			
			tmp_handle = open(velvet_file,'r')
			tmp_fasta = SeqIO.to_dict(SeqIO.parse(tmp_handle,"fasta"))
			tmp_handle.close()
		
			if seq_dir.split("/")[3] in tmp_fasta:
				curr_record = tmp_fasta[seq_dir.split("/")[3]]
			else:
				continue
			new_seq = curr_record.seq[curr_start - 1:curr_stop]
			if(curr_strand == "-"):
				new_seq = curr_record.seq[curr_start - 1:curr_stop].reverse_complement()
			new_record = SeqRecord(new_seq,id=seqname,name=seqname,description="")
				
			SeqIO.write(new_record, est2genome_handle, "fasta")
Example #27
 def _get_seq_dict(self):
     """Internal reusable function to get the sequence dictionary.
     """
     seq_handle = open(self._test_seq_file)
     seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
     seq_handle.close()
     return seq_dict
Example #28
def main():
    if len (sys.argv) != 4 :
        print "Please provide file, the file format, and the desired file format "
        sys.exit (1)
    else:
        f = sys.argv[1]
        fout = "".join(f.split('.')[:-1])
        formatin = sys.argv[2]
        formatout  = sys.argv[3]
        if formatout == 'nexus':
            AlignIO.convert(f,formatin,fout+'.'+formatout,formatout,alphabet= IUPAC.ambiguous_dna)
        if formatout == 'mega':
            handle = open(f, "rU")
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "phylip-relaxed"))
            handle.close()
            
            outfile = open(fout+'.'+formatout,'w')
            outfile.write('#mega'+"\n")
            outfile.write('!Title Mytitle;'+"\n")
            outfile.write('!Format DataType=DNA indel=-;'+"\n\n")
            
            for n in record_dict:
                outfile.write('#'+n+"\n")
                newseq=wrap(str(record_dict[n].seq),60)
                for s in newseq:
                    outfile.write(s+"\n")
            
            outfile.close()
        else:
            AlignIO.convert(f,formatin,fout+'.'+formatout,formatout)
Example #29
 def t_ensembl_nested_features(self):
     """Test nesting of features with GFF2 files using transcript_id.
     """
     rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file))
     assert len(rec_dict["I"].features) == 2
     t_feature = rec_dict["I"].features[0]
     assert len(t_feature.sub_features) == 32
Example #30
def check_reference_alignment(reference_alignment, pdb_files_dir):
    '''Try to read the alignment (with Bio.SeqIO); exit if reading fails.
    Print a warning message (but do not exit) if a structure is not in the
    alignment.'''
    print('\nChecking reference alignment: %s' % reference_alignment)
    try:
        # SeqIO.parse is lazy, so consume it inside the try for the check to be meaningful
        ref_alignment = SeqIO.to_dict(SeqIO.parse(reference_alignment, 'fasta'))
        print('Valid alignment present: %s' % reference_alignment)
    except Exception:
        print('Alignment not present or not in required format (fasta).')
        print('Please check the file "%s".' % reference_alignment)
        sys.exit(1)

    seq_names_alignment = ref_alignment.keys()
    structures = os.listdir(pdb_files_dir)
    not_in_alignment = [x for x in structures if x not in seq_names_alignment]
    if len(not_in_alignment) > 0:
        print('WARNING: The sequence of the following %s PDB-structures is not '
            'present in the reference alignment and will be excluded from '
            'analysis:\n%s' % (len(not_in_alignment), not_in_alignment))
        print('If the dataset is large enough without these structures, this '
            'is not a problem.')
        print('Note: The sequences in the alignment must have the same names '
            'as their corresponding structures in the directory "%s".'
            % pdb_files_dir)
    else:
        print('All structures are present in the reference alignment.')
Example #31
def methyl(args):

    global global_ref_c
    global nr_of_reads
    #meth_dict = {'Methylation_List','Family','UUID','position'}
    #CG_dict = {'Chr','Start','Stop'}
    print "Loading the bam file"
    samfile = pysam.AlignmentFile(args.bam, "rb")

    ref = SeqIO.to_dict(SeqIO.parse(open(args.r), 'fasta'))
    faChrCheck = "chr" in SeqIO.parse(open(args.r), 'fasta').next().id
    header = ["chr", "start", "stop", "UUID", "methylated CpGs", "position"]
    CpG = pd.DataFrame(columns=header)
    if ":" in args.region and "-" in args.region:
        chrom = args.region.split(":")[0]
        start = int(args.region.split(":")[1].split("-")[0])
        end = int(args.region.split("-")[1])
        if not faChrCheck:
            chrom = args.region.split(":")[0][3:]

    else:
        #TODO: add system.stderr message
        print "Either no region given or the region is not given in the needed way chr:start-stop. If it was not your intention, please refer to the argument --region"
        print "We will infer that your reference only contains one element, i.e. an L1 element you want to align agains"
        ## infering chrom and start stop. There should be only one element in the input fasta
        chrom = ref.keys()[0]
        start = 0
        end = len(ref.get(ref.keys()[0]))
        #print "chrom: %s" %chrom
        #print "!"*20
        #print "sequence: %s " %ref.get(chrom)[start:end].seq
        #print "!"*20

    ## fill the dataframe with all CpGs from the reference to then check the methylation status of each read for each CpG
    referenceCpGs = initializeCpG(CpG,
                                  ref.get(chrom)[start:end].seq.upper(), chrom,
                                  start, end)

    ## checking if bam file is indexed by samtools
    if not samfile.check_index():
        print "Input bam file needs to be indexed!"
        print "Please index the bam file using the following command: samtools index " + args.bam
        exit(1)

    global_ref_c = len(referenceCpGs)
    if args.region == "":
        for read in samfile.fetch():
            #print read
            nr_of_reads += 1
            CpG = updateCpG(read, CpG, faChrCheck, referenceCpGs, start,
                            args.strict_cpg)

    else:
        for read in samfile.fetch(
                args.region.split(":")[0],
                int(args.region.split(":")[1].split("-")[0]),
                int(args.region.split("-")[1])):
            #print read
            nr_of_reads += 1
            CpG = updateCpG(read, CpG, faChrCheck, referenceCpGs, start,
                            args.strict_cpg)

    CpG = equalizeCpG(CpG)
    samfile.close()
    return CpG.dropna(thresh=4)
Example #32
def _load_helitrons(helitrons_file):
    return SeqIO.to_dict(SeqIO.parse(helitrons_file, "fasta"))
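A small usage sketch (file name hypothetical). Note that SeqIO.to_dict raises a ValueError on duplicate record ids, so the helitron names in the FASTA must be unique:

helitrons = _load_helitrons("helitrons.fasta")
print(len(helitrons), "helitron sequences loaded")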
Example #33
if len(sys.argv) != 6:
    print(
        'please enter 5 command line arguments to run this script, e.g. '
        'python3 peptides2sequenceStringMatching.py dir/2/peptides/file '
        'dir/to/allProteinsSeqs/ dir/to/output/directory n_threads '
        'suffix-of-the-files-to-search-into'
    )

else:
    peptides_seq_f = sys.argv[1]
    prot_seqs_dir = sys.argv[2]
    out_dir = sys.argv[3]
    n_threads = int(sys.argv[4])
    suffix = sys.argv[5]

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    print('reading all peptide sequences into memory')
    peptide_seqs = SeqIO.to_dict(SeqIO.parse(peptides_seq_f, 'fasta'))
    print('reading all proteins into memory')
    all_prot_files = [
        file for file in os.listdir(prot_seqs_dir) if file.endswith(suffix)
    ]

    peptide_seq_list = [(peptide_id, str(peptide_seqs[peptide_id].seq))
                        for peptide_id in peptide_seqs]

    new2oldPeptide_dic = dict()
    new_peptide_list = list()

    print('converting peptides I 2 L')
    for peptide in peptide_seq_list:
        new_peptide = peptide[1].replace('I', 'L')
        new_tuple = (peptide[0], new_peptide)
Example #34
 def read_fasta(self):
     """
 Reads a fasta file using SeqIO
 """
     self.dict_genome = SeqIO.to_dict(SeqIO.parse(self.file, "fasta"))
Example #35
def get_fasta(genome_file):
    genome_dict = SeqIO.to_dict(SeqIO.parse(genome_file, "fasta"))
    contig2length = {}
    for j in genome_dict:
        contig2length[j] = len(genome_dict[j])
    return genome_dict, contig2length
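A short usage sketch (file name hypothetical): the function returns both the record dict and a contig-to-length map:

genome_dict, contig2length = get_fasta("assembly.fasta")
for contig, length in contig2length.items():
    print(contig, length)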
Example #36
import os
import sys
import subprocess
import re
import shlex
from Bio import SeqIO
from collections import defaultdict

proteinortho = open(sys.argv[1], "r")
proteins = open(sys.argv[2], "r")
ortho_out = open(sys.argv[1] + ".cluster", "w")

# get protein lengths
ldict = {}
prot_dict = SeqIO.to_dict(SeqIO.parse(proteins, "fasta"))
for i in prot_dict:
    record = prot_dict[i]
    ldict[record.id] = len(record.seq)

# parse cluster IDs
tally = 0
cluster_list = []
longest_dict = defaultdict(float)
final_dict = {}
rep_list = []
for i in proteinortho.readlines():
    if i.startswith("#"):
        name = re.sub(r"\.names\.faa", "", i)
        name2 = re.sub("# ", "", name)
        ortho_out.write("Cluster\tRepresentative\tRep_Length\t" + name2)
    else:
Example #37
                ".", "%2E")  #hardcoded in this position
            gff_file = refseq.id + ".velvet_contigs.maker.output/" + seq_dir + "/" + tmp_refseq + ".gff"
            gff_handle = open(gff_file, 'r')
            for gff_line in gff_handle:
                if(re.search("est2genome",gff_line) and \
                re.search("\texpressed_sequence_match\t",gff_line)):
                    curr_start = int(gff_line.split("\t")[3])
                    curr_stop = int(gff_line.split("\t")[4])
                    curr_strand = gff_line.split("\t")[6]
                    should_be_length = curr_stop - curr_start

                    print gff_line
                    print should_be_length

                    tmp_handle = open(filesPerSeqID[refseq.id], 'r')
                    tmp_fasta = SeqIO.to_dict(SeqIO.parse(tmp_handle, "fasta"))
                    tmp_handle.close()

                    if seq_dir.split("/")[3] in tmp_fasta:
                        curr_record = tmp_fasta[seq_dir.split("/")[3]]
                    else:
                        continue
                    new_seq = curr_record.seq[curr_start - 1:curr_stop]
                    if (curr_strand == "-"):
                        new_seq = curr_record.seq[
                            curr_start - 1:curr_stop].reverse_complement()
                    print len(new_seq)
                    new_record = SeqRecord(new_seq,
                                           id=seqname,
                                           name=seqname,
                                           description="")
Example #38
def generate_genbank(
        input_fna, input_gff3, input_faa, output_prefix, organism_name,
        data_file_division, taxonomy):
    '''Generate GenBank format'''
    # Output file name
    outfile = '{}.gb'.format(output_prefix)

    # First, import input_fna in dictionary
    d_fna = SeqIO.to_dict(SeqIO.parse(input_fna, 'fasta'))
    d_faa = SeqIO.to_dict(SeqIO.parse(input_faa, 'fasta'))

    d_fna_sorted = sorted(
        d_fna.items(),
        key=lambda x: int(re.findall(r'\d+', x[0])[0]))

    # Make dictionary for CDS
    d_cds = defaultdict(list)
    d_exon = defaultdict(list)
    for record in parse_gff3(input_gff3):
        if record.type == 'exon':
            exon_parent = record.attributes['Parent']
            d_exon[exon_parent].append(record)

        elif record.type == 'CDS':
            cds_parent = record.attributes['Parent']
            d_cds[cds_parent].append(record)

    my_seq_records = []
    for scaffold, seq in d_fna_sorted:
        my_seq = Seq(str(seq.seq))
        my_seq_record = SeqRecord(my_seq)
        my_seq_record.description = '{} {}'.format(organism_name, scaffold)
        date = datetime.today().strftime('%d-%^b-%Y')
        my_seq_record.annotations['date'] = date
        my_seq_record.annotations['organism'] = organism_name
        my_seq_record.data_file_division = data_file_division
        my_seq_record.annotations['keywords'] = [
            'Whole genome sequencing project']
        my_seq_record.annotations['taxonomy'] = taxonomy.split('; ')
        my_seq_record.annotations['source'] = organism_name

        for record in parse_gff3(input_gff3):
            if scaffold != record.seqid:
                continue

            my_feature_type = record.type
            if my_feature_type in ('exon', 'CDS'):
                continue

            # GFFRecord(seqid='contig1', source='AUGUSTUS', type='gene',
            # start=16942, end=19008, score=0.22, strand='+', phase=None,
            # attributes={'Source': 'braker_Y1:g3308.t1', 'ID': 'Triga_00001'})

            my_start = record.start
            my_end = record.end
            my_strand = 1 if record.strand == '+' else -1

            # Set qualifiers for gene
            if my_feature_type == 'gene':
                gene_start = my_start
                gene_end = my_end
                gene_feature_location = FeatureLocation(
                    gene_start, gene_end, strand=my_strand)
                gene_qualifiers = {}
                gene_locus_tag = record.attributes['ID']
                gene_qualifiers['locus_tag'] = gene_locus_tag
                gene_feature = SeqFeature(
                    gene_feature_location, type=my_feature_type,
                    qualifiers=gene_qualifiers)
                # Append my feature to seq_record
                my_seq_record.features.append(gene_feature)

            elif my_feature_type == 'mRNA':
                sorted_exon_records = sorted(
                    d_exon[record.attributes['ID']], key=lambda x: x.start)
                sorted_cds_records = sorted(
                    d_cds[record.attributes['ID']], key=lambda x: x.start)

                # Feature locations
                # mRNA location is needed to be modified
                fl_mrna_list = []
                for exon_record in sorted_exon_records:
                    fl_element = FeatureLocation(
                        exon_record.start, exon_record.end, strand=my_strand)
                    fl_mrna_list.append(fl_element)

                if len(fl_mrna_list) == 1:
                    mrna_feature_location = fl_mrna_list[0]
                else:
                    mrna_feature_location = CompoundLocation(fl_mrna_list)

                fl_cds_list = []
                for cds_record in sorted_cds_records:
                    fl_element = FeatureLocation(
                        cds_record.start, cds_record.end, strand=my_strand)
                    fl_cds_list.append(fl_element)

                # If fl_cds_list is more than 1 use CompoundLocation
                if len(fl_cds_list) == 1:
                    cds_feature_location = fl_cds_list[0]
                else:
                    cds_feature_location = CompoundLocation(fl_cds_list)

                # Qualifier
                mrna_qualifiers = {}
                cds_qualifiers = {}

                mrna_locus_tag = record.attributes['ID']
                mrna_qualifiers['locus_tag'] = mrna_locus_tag
                if record.score:
                    mrna_qualifiers['note'] = 'prediction score=%s' % (
                        record.score)

                cds_qualifiers['locus_tag'] = mrna_locus_tag
                # Get phase
                if my_strand == 1:
                    phase = int(sorted_cds_records[0].phase) + 1
                elif my_strand == -1:
                    phase = int(sorted_cds_records[-1].phase) + 1
                cds_qualifiers['codon_start'] = phase
                cds_qualifiers['translation'] = str(d_faa[mrna_locus_tag].seq)

                mrna_feature = SeqFeature(
                    mrna_feature_location, type='mRNA',
                    qualifiers=mrna_qualifiers)

                cds_feature = SeqFeature(
                    cds_feature_location, type='CDS', qualifiers=cds_qualifiers)
                # Append my feature to seq_record
                my_seq_record.features.append(mrna_feature)
                my_seq_record.features.append(cds_feature)
        my_seq_records.append(my_seq_record)

    SeqIO.write(my_seq_records, outfile, 'genbank')
Example #39
def writeallAA(gff, genomefasta):
	#Make gff database
	print 'Indexing gff...'
	gff_fn = gff
	db_fn = os.path.abspath(gff_fn) + '.db'
	if os.path.isfile(db_fn) == False:
		gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)

	db = gffutils.FeatureDB(db_fn)
	print 'Done indexing!'

	print 'Indexing genome sequence...'
	if os.path.basename(genomefasta).endswith('.gz'):
		seq_dict = SeqIO.to_dict(SeqIO.parse(gzip.open(genomefasta), 'fasta'))
	else:
		seq_dict = SeqIO.to_dict(SeqIO.parse(genomefasta, 'fasta'))
	print 'Done indexing!'

	
	aaseqs = {} #{ensg_enst : aaseq}
	genes = db.features_of_type('gene')

	genecounter = 0
	for gene in genes:
		genecounter +=1
		if genecounter % 5000 == 0:
			print 'Gene {0}...'.format(genecounter)
		
		#Only protein-coding genes
		if 'protein_coding' not in gene.attributes['gene_type']:
			continue

		for transcript in db.children(gene, featuretype = 'transcript'):
			#Only protein-coding transcripts
			if 'protein_coding' not in transcript.attributes['transcript_type']:
				continue
			#Stitch together CDS pieces
			cdsseq = ''

			if transcript.strand == '+':
				for cds in db.children(transcript, featuretype = 'CDS', order_by = 'start'):
					try:
						seq = seq_dict[transcript.chrom].seq[cds.start - 1 : cds.end]
					except KeyError: #This chromosome isn't in the fasta
						print 'Chromosome {0} is not in the genome sequence fasta.'.format(transcript.chrom)
						continue
					cdsseq += seq

			elif transcript.strand == '-':
				for cds in db.children(transcript, featuretype = 'CDS', order_by = 'start', reverse = True):
					try:
						seq = seq_dict[transcript.chrom].seq[cds.start - 1 : cds.end].reverse_complement()
					except KeyError: #This chromosome isn't in the fasta
						print 'Chromosome {0} is not in the genome sequence fasta.'.format(transcript.chrom)
						continue
					cdsseq += seq

			if cdsseq:
				aaseq = str(cdsseq.translate())
				aaseqs[gene.id + '_' + transcript.id] = aaseq

	fcounter = 1
	txcounter = 0
	totaltxcounter = 0
	longtxcounter = 0
	outfh = open('AAseqs_{0}.fa'.format(fcounter), 'w')
	for tx in aaseqs:
		txcounter +=1
		totaltxcounter +=1
		if txcounter == 10000:
			txcounter = 1
			fcounter += 1
			outfh.close()
			outfh = open('AAseqs_{0}.fa'.format(fcounter), 'w')
		if len(aaseqs[tx]) < 8000: 
			outfh.write('>' + tx + '\n' + aaseqs[tx] + '\n')
		else: 
			longtxcounter +=1

	outfh.close()

	print totaltxcounter, longtxcounter
Example #40
def outparalog_separation(fasta, cfg, **kargs):
    """
    Main function for STEP4:
    interp. 
        kmer (int)                      The number of consecutive bases to be examined
                                        for computing the Kmer score.
        min_sequences_overlap (int)     The minimum overlap between two sequences  
                                        required for computing pairwise distances with
                                        the EMBOSS dismat software.
        outgroups (list(str))           List of the outgroups used to separate putative
                                        outparalogs.
        max_alleles (int)               Maximum number of putative alleles allowed 
                                        within a cluster during the cluster fusion step. 
                                        The final number of putative alleles in the
                                        alignment can exceed this value in later steps.
        max_jaccard_value (float)       Value used during the component fusion step.
                                        Pairs of components that exceed this value of 
                                        the Jaccard Index (JI) for their taxa are not 
                                        considered for fusion. Lower value means higher
                                        stringency.
        min_association_ratio (float)   Parameter that together with the Jaccard Index
                                        value controls the stringency of the fusion of
                                        components. A higher ratio increases the 
                                        stringency.
        max_distance (float)            Max distance between sequences allowed during
                                        the assignment of unclustered sequences to their
                                        components.
        min_taxa_in_alignment (int)     Number of taxa to serve as threshold between
                                        what is deemed to be a 'complete' alignment and  
                                        an 'incomplete' one.
        large_component_ratio (float)   Value of the Size ratio of the largest component
                                        that is used to distinguish small components:
                                        a small component has
                                        size < (size largest) * large_component_ratio.
                                        Large components are used during the clustering
                                        phase with JI.
    """
    in_suffix = cfg["input_suffix"]
    out_suffix = cfg["output_suffix"]
    out_folder = cfg["output_folder"]
    kmers = cfg["kmers"]
    min_overlap = cfg["min_sequences_overlap"]
    outgroups = cfg["outgroups"]
    max_alleles = cfg["max_alleles"]
    max_jaccard = cfg["max_jaccard_value"]
    association_ratio = cfg["min_association_ratio"]
    max_dist = cfg["max_distance"]
    min_taxa = cfg["min_taxa_in_alignment"]
    large_comp_ratio = cfg["large_component_ratio"]
    basename = fasta.split(in_suffix)[0]
    error_template = "An exception of type {0} occurred when trying to \
 {2}. Arguments:\n{1!r}"

    DEBUG = True

    try:
        assert (0 <= cfg["min_association_ratio"] <= 1
                and 0 <= cfg["max_jaccard_value"] <= 1
                and 0 <= cfg["large_component_ratio"] <= 1)
    except AssertionError:
        s = "Wrong parameters: 'min_association_ratio', 'max_jaccard_value' \
         and 'large_component_ratio' must be in [0, 1]"

        raise ValueError(s)

    if cfg["dna_model"] not in [0, 1, 2, 3, 4, 5]:
        raise ValueError(
            "Wrong parameters: 'dna_model' must be in [0, 1, 2, 3, 4, 5]")

    try:
        records = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    except Exception:
        exception = ("[Error]: Cannot open alignment {}\n"
                     "Verify that the file is in fasta format".format(fasta))
        return (exception, None)

    n_seq = len(records)
    if n_seq <= 3:
        s_i = "STEP4 Done with fasta: {}".format(fasta)
        s_d = ("No transformation performed on fasta {0}, "
               "which only contains {1} sequences".format(fasta, n_seq))
        return (s_i, s_d)

    length = len(list(records.values())[0])
    if min_overlap >= length:
        exception = ("[Error]: for fasta {} 'min_sequences_overlap' "
                     "exceeds the length of the alignment".format(fasta))
        return (exception, None)
    if DEBUG:
        print("working on fasta {}".format(fasta))
    #  1- Remove empty sequences.
    if DEBUG:
        print("1- Remove empty sequences.")
    for record in list(records.keys()):
        if set(str(records[record].seq)) == {"-"}:
            records.pop(record, None)

    #  2- Return the original aln if no outgroup
    if (set(outgroups).intersection(
            set([x.split("|")[0] for x in records.keys()])) == set()):
        SeqIO.write(
            [x[1] for x in sorted(records.items(), key=lambda x: x[0])],
            basename + out_suffix + "_no_outgroup.fasta",
            "fasta",
        )
        shutil.move(basename + out_suffix + "_no_outgroup.fasta", out_folder)
        s_i = "Done with fasta {}".format(fasta)
        s_d = "No outgroup is present for fasta {}".format(fasta)
        return (s_i, s_d)

    #  Calculate distance matrix used to assign sequences to components
    if DEBUG:
        print("Calculate distance matrix")
    calculate_distance_matrix(fasta, cfg)
    taxa = {records[rec].name: str(records[rec].seq) for rec in records.keys()}
    #  For each sequence we recode 0 for '-', 1 otherwise,
    #  into a dictionary {name_number: profile}
    taxa_profiles = sequence_profiler(taxa)
    if DEBUG:
        print("Test the different kmers in the kmers list")
    #  3- Test the different kmers in the list in order to obtain several components.
    #
    retained_comp = None
    for kmer in kmers:
        idxs_length = len(list(records.values())[0].seq) - kmer
        if idxs_length <= 20:
            s_i = "Done with fasta {}".format(fasta)
            s_d = ("In fasta {0} with kmer {1},\n"
                   "the alignment is too short to perform STEP4".format(
                       fasta, kmer))
            return (s_i, s_d)

        comp_1, aln_dict, accepted_outgroup = obtain_components(
            fasta, taxa, idxs_length, kmer, outgroups, min_overlap,
            taxa_profiles)
        if DEBUG:
            if comp_1:
                print("# components: {0} with Kmer {1}".format(
                    len(comp_1), kmer))
        if comp_1 is None:
            continue
        elif comp_1 is not None and len(comp_1) == 1:
            if validate_component(comp_1[0], taxa_profiles, min_overlap,
                                  max_alleles):
                retained_comp = comp_1
                break
            else:
                retained_comp = comp_1
        else:
            retained_comp = comp_1
            break

    #  4- If no components are obtained, return the original aln.
    #
    if retained_comp is None or len(retained_comp) == 0:
        SeqIO.write(
            [x[1] for x in sorted(records.items(), key=lambda x: x[0])],
            basename + out_suffix + "_no_splitting.fasta",
            "fasta",
        )
        try:
            shutil.move(basename + out_suffix + "_no_splitting.fasta",
                        out_folder)
        except Exception as ex:
            s = "move fasta: {}".format(fasta)
            message = error_template.format(type(ex).__name__, ex.args, s)
            print(message)

        s_i = "Done with fasta {}".format(fasta)
        s_d = ("No components were found using kmer {}: "
               "no inparalog removal was attempted".format(kmer))
        return (s_i, s_d)

    #  5. If a single component is found, use it to assign the sequences
    #     not yet included, based on a distance method.
    if DEBUG:
        print("Cluster with distances")
    #  Common parameters for all calls of dist_assignment()
    dist_assignment_para = [
        basename,
        out_folder,
        max_dist,
        min_taxa,
        outgroups,
        min_overlap,
        out_suffix,
        large_comp_ratio,
    ]
    if len(retained_comp) == 1:
        final_comp = dist_assignment(retained_comp, records, taxa_profiles,
                                     *dist_assignment_para)
        if DEBUG:
            print("writing a single component")
        write_fasta(final_comp, records, basename, out_folder, min_taxa,
                    out_suffix)

        s_i = "Done with fasta {}".format(fasta)
        s_d = "# retained components: {}".format(len(final_comp))
        return (s_i, s_d)

    #  6. If several components are retained, compute the Jaccard index
    #     between each pair of components:
    else:
        jaccard = []
        for pair in itertools.combinations(retained_comp, 2):
            jaccard.append(calculate_Jaccard_index(pair))
        jaccard_filtered = [x for x in jaccard if x[1] <= max_jaccard]
        #
        #  6a. The Jaccard index could be computed, but every pair exceeds
        #      'max_jaccard_value', so no pair qualifies for fusion.
        #      No additional component clustering is performed and sequences
        #      are assigned based on a distance method.
        if len(jaccard_filtered) == 0:
            if DEBUG:
                print("Cluster without Jaccard")
            final_comp = dist_assignment(retained_comp, records, taxa_profiles,
                                         *dist_assignment_para)
            if DEBUG:
                print("writing several components")
            write_fasta(final_comp, records, basename, out_folder, min_taxa,
                        out_suffix)

            s_i = "Done with fasta {}".format(fasta)
            s_d = "# retained components: {}".format(len(final_comp))
            return (s_i, s_d)
        #
        #  6b. The Jaccard index could be computed and some pairs fall below
        #      the set maximum. Associations between components are checked to
        #      see whether they can be justified given the data.
        #      Remaining individual sequences are clustered by distance.
        elif len(jaccard_filtered) > 0:
            #  At each position we calculate the components.
            if DEBUG:
                print("Cluster with Jaccard")
            JI_gp = {}
            for pair in jaccard_filtered:
                gp = validate_associations(pair, list(aln_dict.values()))
                JI_gp[(tuple(pair[0][0]), tuple(pair[0][1]))] = {
                    "JI": pair[1],
                    "gp": gp,
                }

            edges = [(
                key[0],
                key[1],
                {
                    "dist":
                    JI_gp[key]["gp"][0] /
                    (JI_gp[key]["gp"][0] + JI_gp[key]["gp"][1])
                },
            ) if (JI_gp[key]["gp"][1] + JI_gp[key]["gp"][0]) != 0 else
                     (key[0], key[1], {
                         "dist": 0.0
                     }) for key in JI_gp.keys()]
            filtered_edges = [
                e for e in edges if e[2]["dist"] > association_ratio
            ]

            #  Separate components clustered with Jaccard index
            #   from unclustered components.
            single_edges = []
            for e in filtered_edges:
                single_edges.append(e[0])
                single_edges.append(e[1])
            #  Components that are linked by an edge
            L_single_edges = list(set(single_edges))
            #  Components that lack an edge to any other component
            #  are separated from the rest
            unclustered_comp = []
            for e in edges:
                if e[0] not in L_single_edges:
                    unclustered_comp.append(e[0])
                if e[1] not in L_single_edges:
                    unclustered_comp.append(e[1])
            L_unclustered_comp = list(set(unclustered_comp))
            #  Compatible components are clustered and the remaining
            #  components are assigned using a distance method.

            if filtered_edges:
                #  Assembly of components that are associated with JI
                G_2 = nx.Graph()
                G_2.add_edges_from(filtered_edges)
                #  nx.connected_component_subgraphs was removed in
                #  networkx 2.4; iterating over the node sets is equivalent.
                filtered_comp = [
                    list(set(itertools.chain(*comp)))
                    for comp in nx.connected_components(G_2)
                ]
                comp_2 = filtered_comp + [list(x) for x in L_unclustered_comp]
                #  add the seq from components that have not been clustered
                final_comp = dist_assignment(comp_2, records, taxa_profiles,
                                             *dist_assignmet_para)

                write_fasta(final_comp, records, basename, out_folder,
                            min_taxa, out_suffix)
                s_i = "Done with fasta {}".format(fasta)
                s_d = "# retained components: {}".format(len(final_comp))
                if DEBUG:
                    print("Components are compatible, " + s_d)
                return (s_i, s_d)

            #  Components are incompatible.
            #  Return all components as separated alignments.
            #  Add unclustered seq using a distance method.
            else:
                final_comp = dist_assignment(retained_comp, records, taxa_profiles,
                                             *dist_assignment_para)
                write_fasta(final_comp, records, basename, out_folder,
                            min_taxa, out_suffix)
                s_i = "Done with fasta {}".format(fasta)
                s_d = "# retained components: {}".format(len(final_comp))
                if DEBUG:
                    print("Components are incompatible, " + s_d)
                return (s_i, s_d)
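For reference, all of the tuning knobs above are read from a single cfg dictionary. A minimal sketch of such a configuration follows; the key names come from the code above, but every value is an illustrative assumption, not a recommendation:

# Hypothetical STEP4 configuration; key names match the docstring above,
# values are placeholders for illustration only.
cfg = {
    "input_suffix": ".fasta",
    "output_suffix": "_step4",
    "output_folder": "step4_output",
    "kmers": [14, 12, 10],           # tried in order until components emerge
    "min_sequences_overlap": 100,
    "outgroups": ["outgroup_sp1"],
    "max_alleles": 2,
    "max_jaccard_value": 0.3,        # pairs above this JI are never fused
    "min_association_ratio": 0.6,    # higher means stricter fusion
    "max_distance": 0.25,
    "min_taxa_in_alignment": 4,
    "large_component_ratio": 0.5,
    "dna_model": 2,                  # must be one of 0-5
}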
parser = argparse.ArgumentParser(
    description="Output reads that didn't classify to a set of refseqs")
parser.add_argument("RefSeq",
                    type=str,
                    help="File of Sequences that we care about")
parser.add_argument("taxonomer_out",
                    type=str,
                    help="Taxonomer classifier output")
parser.add_argument("fasta",
                    type=str,
                    help="fasta file of reads that we want to filter")
args = parser.parse_args()

#make a hash of the reference sequences
handle = open(args.RefSeq, 'r')
refseq_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
handle.close()

#Make a hash of the taxonomer output
#If a read didn't classify to one of the reference sequences, count it as unclassified
classified_hash = {}
handle = open(args.taxonomer_out, 'r')
for line in handle:
    parts = line.split("\t")

    if (parts[0] == "U"):
        classified_hash[parts[1]] = 1
    else:
        if (not parts[3] in refseq_dict):
            classified_hash[parts[1]] = 1
handle.close()
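The snippet stops after building classified_hash; the filtering step promised by the argparse description is not shown. A minimal sketch of a plausible continuation, assuming the reads flagged in classified_hash are the ones to emit (the output file name is hypothetical):

# Assumed continuation: write out the reads recorded in classified_hash.
with open("unclassified_reads.fasta", "w") as out_handle:
    for record in SeqIO.parse(args.fasta, "fasta"):
        if record.id in classified_hash:
            SeqIO.write(record, out_handle, "fasta")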
Ejemplo n.º 42
0
def cDNAcoordstoseq(cDNAcoords, genomefasta, outputfasta):
    cDNAseqs = {}  #{ENSTRANS : seq}
    print('Indexing genome sequence...')
    seq_dict = SeqIO.to_dict(SeqIO.parse(gzip.open(genomefasta, 'rt'), 'fasta'))
    print('Done indexing!')
    chrmswithoutseq = [
    ]  #Chromosome names that are in cDNAcoords but that don't have a fasta entry in genomefasta

    for transcript in cDNAcoords:
        cDNAseq = ''
        #Ensembl vs UCSC transcript names
        #Ensembl
        if 'NM_' not in transcript and 'NR_' not in transcript:
            tname = transcript.split('_')[0].replace(
                'transcript:', ''
            )  #in cDNAcoords txnames are like 'transcript:ENSMUST00000038375_chr2_+'
            chrm = transcript.split('_')[1]
            strand = transcript.split('_')[2]

        #UCSC
        elif 'NM_' in transcript or 'NR_' in transcript:
            if ':' in transcript:
                tname = transcript.split('_chr')[0].split(':')[
                    1]  #transcript:NM_175684_chr18_-
            else:
                tname = transcript.split('_chr')[0]
            chrm = transcript.split('_')[-2]
            strand = transcript.split('_')[-1]

        #Is this chromosome in genomefasta?
        if chrm not in seq_dict:
            if chrm not in chrmswithoutseq:
                print('WARNING: No entry for chromosome {0} in genomefasta.'.format(chrm))
                print(transcript)
                chrmswithoutseq.append(chrm)
            continue

        for exon in cDNAcoords[transcript]:
            start = exon[0]
            end = exon[1]
            if strand == '+':
                exonseq = seq_dict[chrm].seq[start - 1:end].upper()
                cDNAseq += exonseq
            elif strand == '-':
                exonseq = seq_dict[chrm].seq[start - 1:end].reverse_complement(
                ).upper()
                newseq = exonseq + cDNAseq
                cDNAseq = newseq

        cDNAseqs[tname] = cDNAseq

    print('Found sequence data for {0} of {1} transcripts.'.format(
        len(cDNAseqs), len(cDNAcoords)))

    with open(outputfasta, 'w') as f:
        for tname in cDNAseqs:
            seq = str(cDNAseqs[tname])

            #If transcript IDs have a dot (like ENSMUST0000046543.2)
            if '.' in tname:
                tname = tname.split('.')[0]
            f.write('>' + tname + '\n' + seq + '\n')

    return cDNAseqs
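A hypothetical invocation of the function above, assuming cDNAcoords maps transcript keys of the form 'transcript:&lt;ID&gt;_&lt;chrom&gt;_&lt;strand&gt;' to lists of 1-based (start, end) exon tuples and that the genome fasta is gzipped; all names and coordinates below are illustrative:

# Hypothetical input and call; paths and coordinates are placeholders.
cDNAcoords = {
    'transcript:ENSMUST00000038375_chr2_+': [(1000, 1150), (2000, 2210)],
}
cDNAseqs = cDNAcoordstoseq(cDNAcoords, 'genome.fa.gz', 'cdnas.fa')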
Ejemplo n.º 43
0
    sys.exit()

#open the BLAST files
try:
    blast1F = open(args.blast1, 'r')
except IOError:
    print("\n BLAST 1 not found in folder.")
    sys.exit()
try:
    blast2F = open(args.blast2, 'r')
except IOError:
    print("\n BLAST 2 not found in folder.")
    sys.exit()

#read in the protein file of isolate 1 and for each protein, get the length and name (SeqIO auto cuts it at first space) and save to a dictionary
prots1all = SeqIO.to_dict(SeqIO.parse(args.protein1, "fasta"))
prots1 = {}
for prot in prots1all:
    prots1[prot] = len(prots1all[prot].seq)
#repeat for the protein file of isolate 2
prots2all = SeqIO.to_dict(SeqIO.parse(args.protein2, "fasta"))
prots2 = {}
for prot in prots2all:
    prots2[prot] = len(prots2all[prot].seq)

#get the totals
T1 = len(prots1)
T2 = len(prots2)

#go through the first BLAST file and for each protein that has a match meeting the minimum requirements, add 1 to the counter and the protein name to the list (to avoid counting the same protein twice)
C1 = 0.0
Ejemplo n.º 44
0
        print(str(err))
        sys.exit(2)
    for o, a in opts:
        if o in ('-m', '--mode'):
            mode = a
        if o in ('-p', '--path'):
            path = a
        if o in ('-w', '--window'):
            window = a
    mode2serial = {
        'transcript_train': '0',
        'transcript_test': '2',
        'cdna_train': '1',
        'cdna_test': '3'
    }

    human_seq = SeqIO.to_dict(
        SeqIO.parse(path + "human_" + mode + '.txt', "fasta"))
    mouse_seq = SeqIO.to_dict(
        SeqIO.parse(path + "mouse_" + mode + '.txt', "fasta"))
    human_site = open(path + "human_pku" + mode2serial[mode], "r")
    mouse_site = open(path + "mouse_pku" + mode2serial[mode], "r")
    pos_sample = open(path + window + '/' + mode + "/p_samples", "w")
    neg_sample = open(path + window + '/' + mode + "/n_samples", "w")
    generate_sample_seq(human_seq, mouse_seq, human_site, mouse_site, window,
                        pos_sample, neg_sample)


if __name__ == '__main__':
    main(sys.argv)
# also: write to 'exons_only.fasta/partition' if no intron file exists

import sys,os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

if len(sys.argv) < 4:
    print("Usage: python combine_alignments.py exon.fasta intron.fasta[or any value if no intron] geneName")
    sys.exit(1)
    
exon_fn = sys.argv[1]
intron_fn = sys.argv[2]
geneName = sys.argv[3]

exon_dict = SeqIO.to_dict(SeqIO.parse(exon_fn,'fasta'))
exonLength = len(next(iter(exon_dict.values())))
    
if os.path.isfile(intron_fn):
    with open("{}.combined.fasta".format(geneName),'w') as outfile:
        for seq in SeqIO.parse(intron_fn,'fasta'):
            intronLength = len(seq)
            sampleID = seq.id.split("-")[0]
            newseq = exon_dict[sampleID].seq + seq.seq
            outfile.write(">{}\n{}\n".format(sampleID,newseq))
        partition = """DNA, codon1 = 1-{}\\3
DNA, codon2 = 2-{}\\3
DNA, codon3 = 3-{}\\3
DNA, intron = {}-{}
""".format(exonLength, exonLength, exonLength, exonLength+1,exonLength+intronLength)
    with open("{}.combined.partition".format(geneName),'w') as partitionfile:
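        # The original example is truncated here; presumably the partition
        # string assembled above is written into the file just opened
        # (assumed continuation, not part of the original snippet):
        partitionfile.write(partition)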
Ejemplo n.º 46
0
#!/usr/bin/python3
from Bio import SeqIO

sequences_handle = SeqIO.parse('strong_hits.fasta', 'fasta')
seq_dict = SeqIO.to_dict(sequences_handle)

remove_list = open('delete_claudins', 'r')

for next_line in remove_list:
    next_line = next_line.strip()
    seq_dict.pop(next_line, None)  # tolerate ids that are absent from the fasta
    
output_file = open('strong_hits_trimmed.fasta', 'w')

for i in seq_dict:
    SeqIO.write(seq_dict[i], output_file, 'fasta')


remove_list.close()
output_file.close()
Ejemplo n.º 47
0
		annot = text[10]
		for name in names:
			if name in annot:
				dict_annot[name].append(id[:-1])
		lista_id.append(id[:-1])
			
	lista_id = list(set(lista_id))
	lista_id.sort()
	for name in dict_annot:
		li = dict_annot[name]
		li = list(set(li))
		li.sort()
		dict_annot[name] = li
	print("Loading sequences")
	handle = open(file[:-5], "r")
	sequen = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
	handle.close()
	print("Getting sequences...")
	for name in dict_annot:
		for el in dict_annot[name]:
			fas_1 = ">%s\n%s\n" % (el+"/1", str(sequen[el+"1"].seq))
			fas_2 = ">%s\n%s\n" % (el+"/2", str(sequen[el+"2"].seq))
			dict_name_1[name].write(fas_1)
			dict_name_2[name].write(fas_2)

	for el in lista_id:
		fas_1 = ">%s\n%s\n" % (el+"/1", str(sequen[el+"1"].seq))
		fas_2 = ">%s\n%s\n" % (el+"/2", str(sequen[el+"2"].seq))
		out_in1.write(fas_1)
		out_in2.write(fas_2)
		del sequen[el+"1"]
Ejemplo n.º 48
0
            for souche in dicoFastaSample.keys():
                #print(souche)
                if BR32ID in dicoFastaSample[souche].keys():

                    #print(souche+"\t"+basename+"\t"+MGG+"\t"+dicoFastaSample[souche][BR32ID].id)

                    if first == 0:
                        count += 1
                        try:
                            outputFile.close()
                        except:
                            pass
                        outputFile = open(pathDirectoryOut.pathDirectory +
                                          basename, "w")  # Open output file
                        #	Open the fasta sequences, load them into a dictionary and write an output file containing the renamed sequences
                        record_dict = SeqIO.to_dict(
                            SeqIO.parse(open(filename, "r"), "fasta"))
                        for ID, record in record_dict.items():
                            SeqIO.write(record, outputFile, "fasta")
                        first = 1
                    record = dicoFastaSample[souche][BR32ID]
                    newRecord = SeqRecord(Seq(str(record.seq)),
                                          id=souche,
                                          name=souche,
                                          description=record.id)
                    SeqIO.write(newRecord, outputFile, "fasta")
    try:
        outputFile.close()
    except:
        pass
    print(count)
                        "--sam_filename",
                        required=True,
                        help="Aligned SAM filename.")
    parser.add_argument("-g",
                        "--genome_filename",
                        required=True,
                        help="Genome fasta.")
    parser.add_argument("-o",
                        "--output_prefix",
                        required=True,
                        help="Output prefix.")
    parser.add_argument("--gff",
                        default=None,
                        help="Annotation GFF (optional).")

    args = parser.parse_args()

    # read genome
    print("Reading genome {0}...".format(args.genome_filename), file=sys.stderr)
    genome_d = SeqIO.to_dict(SeqIO.parse(open(args.genome_filename), 'fasta'))

    # read gff
    if args.gff is not None:
        print("Reading annotation {0}...".format(args.gff), file=sys.stderr)
        junction_info = read_annotation_for_junction_info(args.gff)
    else:
        junction_info = None

    evaluate_alignment_sam(args.input, args.sam_filename, genome_d,
                           args.output_prefix, junction_info)
Ejemplo n.º 50
0
 def test_order_to_dict(self):
     """Check to_dict preserves order in indexed file."""
     d = SeqIO.to_dict(SeqIO.parse(self.f, "fasta"))
     self.assertEqual(self.ids, list(d))
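The order guarantee this test relies on holds because SeqIO.to_dict returns a plain dict and, from Python 3.7 onwards, dicts preserve insertion order. A standalone illustration of the same property (the file name is an assumption):

from Bio import SeqIO

# On Python >= 3.7 the dict returned by to_dict keeps records in file order.
order_in_file = [rec.id for rec in SeqIO.parse("example.fasta", "fasta")]
d = SeqIO.to_dict(SeqIO.parse("example.fasta", "fasta"))
assert list(d) == order_in_file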
Ejemplo n.º 51
0
def infer_vcf(args):
    stats = Counter()
    model = models.load_model(args.weights_hd5,
                              custom_objects=models.get_all_custom_objects(
                                  args.labels))

    vcf_reader = pysam.VariantFile(args.negative_vcf, 'r')
    vcf_writer = pysam.VariantFile(args.output_vcf,
                                   'w',
                                   header=vcf_reader.header)
    print('got vcfs.')

    reference = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta"))
    print('Loaded reference FASTA:', args.reference_fasta)

    samfile = pysam.AlignmentFile(args.bam_file, "rb")
    print('got sam.')

    if args.chrom:
        intervals = {args.chrom: [[int(args.start_pos)], [int(args.end_pos)]]}
    elif args.bed_file:
        intervals = td.bed_file_to_dict(args.bed_file)
    else:
        raise ValueError(
            'What do you want to iterate over? Use arguments --bed_file or --chrom --start_pos --end_pos'
        )

    tensor_batch = np.zeros((args.batch_size, ) +
                            defines.tensor_shape_from_args(args))
    gpos_batch = []

    print(len(intervals), 'intervals to iterate over, contigs:',
          intervals.keys())
    start_time = time.time()
    for k in intervals:
        contig = reference[k]
        args.chrom = k
        for start, stop in zip(intervals[k][0], intervals[k][1]):
            cur_pos = start
            for cur_pos in range(start, stop, args.window_size):
                record = contig[cur_pos:cur_pos + args.window_size]
                t = td.make_calling_tensor(args, samfile, record, cur_pos,
                                           stats)

                if t is not None:
                    tensor_batch[stats['cur_tensor']] = t
                    gpos_batch.append((k, cur_pos, record))
                    stats['cur_tensor'] += 1

                if stats['cur_tensor'] == args.batch_size:
                    predictions = model.predict(
                        tensor_batch)  # predictions is a numpy array
                    predictions_to_variants(args, predictions, gpos_batch,
                                            tensor_batch, vcf_writer, contig)
                    tensor_batch = np.zeros(
                        (args.batch_size, ) +
                        defines.tensor_shape_from_args(args))
                    stats['cur_tensor'] = 0
                    stats['batches_processed'] += 1
                    gpos_batch = []

                    if stats['batches_processed'] % 100 == 0:
                        elapsed = time.time() - start_time
                        t_per_minute = stats[
                            'batches_processed'] * args.batch_size / (elapsed /
                                                                      60)
                        print('At genomic position:', k, cur_pos,
                              'Tensors per minute:', t_per_minute,
                              'Batches processed:', stats['batches_processed'])
                        for s in stats.keys():
                            print(s, 'has:', stats[s])

    for s in stats.keys():
        print(s, 'has:', stats[s])
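    # Hypothetical addition, not in the original: tensors accumulated after
    # the last full batch are never sent to model.predict. A final flush
    # could look like this, assuming predictions_to_variants tolerates a
    # partial batch.
    if stats['cur_tensor'] > 0:
        predictions = model.predict(tensor_batch[:stats['cur_tensor']])
        predictions_to_variants(args, predictions, gpos_batch, tensor_batch,
                                vcf_writer, contig)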
    global name_neg_file
    with open(output_dir + name_neg_file + "_" + str(p) + "_neg.bed",
              "w") as OUT:
        #		map(lambda x: OUT.write(x+"\n"),positions_neg_region)
        for feature in positions_neg_region:
            OUT.write(feature + "\n")


################################################			 MAIN				################################################
random.seed(a=seed_rand)  # seeding randomness
bin_size_list = []
print("\n" + "=" * 50 + "\n")
try:
    print("loading genome file")
    Fasta_tair = SeqIO.to_dict(SeqIO.parse(name_fasta_file, "fasta"))
    print("importing information file")
    info_regions = pybedtools.BedTool(name_bed_file)
    (positions_pos_regions_filename, Nmin,
     Nmax) = treat_positive_region(name_pos_file, name_fasta_file,
                                   name_bed_file)
    list_pos = np.genfromtxt(positions_pos_regions_filename,
                             dtype=[('chrom', 'S20'), ('start', int),
                                    ('stop', int), ('type', 'S300'),
                                    ('GC', float), ('length', int)])
    #	list_=np.unique([v for i,v in enumerate(list_pos['type'])]).tolist()
    #	for elt in list_:
    #		print(elt,len([i for i,v in enumerate(list_pos['type']) if elt == v]))
    print("cutting genome into pieces - please wait")
    genome_regions_library = {}
    (genome_regions_library,
#!/usr/bin/python

from Bio import SeqIO
import sys

arg1 = sys.argv[1]
arg2 = sys.argv[2]
arg3 = sys.argv[3]

filename = arg1
reads_dict = SeqIO.to_dict(SeqIO.parse(arg2, "fasta"))
with open(arg3, "w") as output_file:
    n = 0
    for record in SeqIO.parse(filename, "fasta"):
        if record.id in reads_dict:
            #n=n+1
            SeqIO.write(record, output_file, "fasta")
        else:
            #SeqIO.write(record, output_file, "fasta")
            n = n + 1

print(n)
Ejemplo n.º 54
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from Bio import SeqIO
import sys

mapfolder = ''

#%%
filenumber = sys.argv[1]
fqread1 = SeqIO.to_dict(
    SeqIO.parse(mapfolder + 'Split1-' + str(filenumber), 'fastq'))
fqread2 = SeqIO.to_dict(
    SeqIO.parse(mapfolder + 'Split2-' + str(filenumber), 'fastq'))

sublib = pd.read_pickle('../../code/design/LIBRARY/threelibrary210.pkl')
sublib['barcode'] = sublib.varseq.apply(lambda x: x[18:30])
readsmap = pd.Series([''], index=sublib.index)

liftfor = 'TGCGAGTTAGGGGACGGT'
#upstream='ACTAGTTTACGACGGGTT'

for read in fqread1.keys():
    if (fqread1[read].seq.find(liftfor) > -1):
        testbc = fqread1[read].seq[fqread1[read].seq.find(liftfor) +
                                   18:fqread1[read].seq.find(liftfor) + 30]
        readsmap.loc[sublib[sublib.barcode == testbc].index] = readsmap.loc[
            sublib[sublib.barcode == testbc].index] + ' ' + [
                str(fqread2[read].seq)
Ejemplo n.º 55
0
import matplotlib.ticker as tck

import mpl_toolkits.axisartist as axisartist
from matplotlib.patches import Rectangle

from Bio import SeqIO
from BCBio import GFF


def check_location_intersection(location1, location2):
    if location1.start in location2 or location1.end in location2:
        return True
    if location2.start in location1 or location2.end in location1:
        return True
    return False

annotation_file = "/home/mahajrod/Reference_genomes/drosophila_melanogaster/r6.03/gtf/sbr.gtf"
with open(annotation_file, "r") as ann_fd:
    annotations_dict = SeqIO.to_dict(GFF.parse(ann_fd))

CAGE_dir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/CAGE/GSE66284/"
annotation_file = "/home/mahajrod/Reference_genomes/drosophila_melanogaster/r6.03/gtf/sbr_locus.gtf"
description_file = CAGE_dir + "good_samples_description.t"
length_file = "length_sam_file.len"
histo_file = "nxf_region_filtered_sorted_filtered.histo"

with open(annotation_file, "r") as ann_fd:
    annotations_dict = SeqIO.to_dict(GFF.parse(ann_fd))

samples_description_dict = {"sample": ["SRR488285",
                                       #"SRR488282",
                                       #"SRR488271",
                                       "SRR488272"],
                            "description": ["Adult Mated Male 4 days Post-eclosion Testes",
Ejemplo n.º 56
0
def proteinToCodonAlignment(proteinAlignment, extraDnaSeqs=None):
    protSeqDict = {}
    for seqRecord in proteinAlignment:
        protSeqDict[seqRecord.id] = seqRecord
    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')
    #if Debug:
    #     LOG.write("dnaFasta sample: %s\n"%dnaFasta[:100])

    dnaSeqDict = SeqIO.to_dict(
        SeqIO.parse(StringIO(dnaFasta),
                    "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))
    for seqId in protSeqDict:
        if extraDnaSeqs and seqId in extraDnaSeqs:
            dnaSeqDict[seqId] = extraDnaSeqs[seqId]
            if Debug:
                LOG.write("appending extra DNA seq %s\n" % seqId)
    if set(dnaSeqDict.keys()) != set(protSeqDict.keys()):
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (", ".join(sorted(protSeqDict)), ", ".join(sorted(dnaSeqDict))))
    for seqId in dnaSeqDict:
        if not len(dnaSeqDict[seqId].seq):
            #del(dnaSeqDict[seqId])
            LOG.write("warning: seqId %s length of dna was zero\n" % seqId)
    dnaSeqRecords = []
    for proteinSeq in proteinAlignment:
        dnaSeqRecords.append(dnaSeqDict[proteinSeq.id])

    if Debug:
        LOG.write("dna seqs has %d seqs\n" % (len(dnaSeqRecords)))
        #LOG.write("DNA seq ids: %s\n"%(", ".join(sorted(dnaSeqDict))))
        #LOG.write("pro seq ids: %s\n"%(", ".join(sorted(protSeqDict))))
        #LOG.write("first two aligned DNA seqs:\n")
        #SeqIO.write(dnaSeqRecords[:2], LOG, "fasta")
        #LOG.flush()
    """
    # now check length of protein vs dna sequences, extend dna if needed to make match in numbers of codons
    for i, protRec in enumerate(proteinAlignment):
        protSeq = str(protRec.seq)
        protSeq.replace('-','')
        protLen = len(protSeq)
        if len(dnaSeqs[i].seq) < protLen*3:
            shortfall = (protLen*3) - len(dnaSeqs[i].seq)
            if Debug:
                LOG.write("DNA seq for %s is too short for protein, shortfall = %d\n"%(protRec.id, shortfall))
            # extend on both ends to be safe
            dnaSeqs[i].seq = "N"*shortfall + dnaSeqs[i].seq + "N"*shortfall
    """
    returnValue = None
    #with warnings.catch_warnings():
    #warnings.simplefilter('ignore', BiopythonWarning)
    #try:
    #ambiguous_nucleotide_values = {'K': 'GT', 'M': 'AC', 'N': 'ACGT', 'S': 'CG', 'R': 'AG', 'W': 'AT', 'Y': 'CT'}
    #ambiguous_protein_values = {'X': 'ACDEFGHIKLMNOPQRSTVWY', 'J': 'IL', 'B': 'DN', 'Z': 'EQ'}
    #ambiguous_codon_table = CodonTable.AmbiguousCodonTable(CodonTable.ambiguous_dna_by_name["Standard"], IUPAC.IUPACAmbiguousDNA(), ambiguous_nucleotide_values, IUPAC.protein, ambiguous_protein_values)
    #returnValue = codonalign.build(pro_align=proteinAlignment, nucl_seqs=dnaSeqRecords, codon_table=ambiguous_codon_table, max_score=1000)
    returnValue = codonalign.build(pro_align=proteinAlignment,
                                   nucl_seqs=dnaSeqRecords,
                                   max_score=1000)
    for dnaSeq in returnValue:
        proteinRecord = protSeqDict[dnaSeq.id]
        if proteinRecord.annotations:
            dnaSeq.annotations = proteinRecord.annotations.copy()

        #except Exception as e:
        #    LOG.write("problem in codonalign, skipping\n%s\n"%str(e))
        #    raise(e)
    return returnValue
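A sketch of how this helper might be driven, assuming the protein alignment is read with Bio.AlignIO and that the sequence ids resolve through patric_api; the file name is hypothetical:

from Bio import AlignIO

# Hypothetical usage; 'family_protein.aln' is a placeholder path.
protein_aln = AlignIO.read('family_protein.aln', 'fasta')
codon_aln = proteinToCodonAlignment(protein_aln)
for record in codon_aln:
    print(record.id, len(record.seq))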
Ejemplo n.º 57
0
    aln_out = group + '.aln'
    run_mafft_commandline(args, seq_outf, aln_out)
    remove(seq_outf)
    # check gap percentage
    gap_perc = gaps_in_alignment(aln_out)
    if gap_perc < args.gap:
        used_group.append(aln_out)
    else:
        print('{0} was discarded due to higher gaps {1:5.2f}% (>= {2:5.2f}%)'.format(aln_out, gap_perc, args.gap))
        remove(aln_out)

if (args.con and (len(used_group) > 1)) or args.orthogrp:
    # concatenate all single gene alignments
    conseq_file = 'concatenated_seqs_%d.aln' % len(used_group)
    print('Concatenating %d alignments into one file (%s)' % (len(used_group), conseq_file))
    seqs = SeqIO.to_dict(SeqIO.parse(used_group[0], 'fasta'))
    remove(used_group[0])
    for seq_file in used_group[1:]:
        other_seqs = SeqIO.to_dict(SeqIO.parse(seq_file, 'fasta'))
        for seq_id in seqs:
            seqs[seq_id].seq += other_seqs[seq_id].seq
        remove(seq_file)
    SeqIO.write(seqs.values(), conseq_file, 'fasta')
    used_group = [conseq_file,]

#### D. using the alignment(s) to make tree(s)
for idx, group in enumerate(used_group):
    out_file = group.split('.')[0] + '.tree'
    idx += 1
    print('Making a tree to %s (%d/%d)' % (out_file, idx, len(used_group)))
    run_fasttree_commandline(args, group, out_file)
Ejemplo n.º 58
0
parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    help="Genbank file with annotations")
parser.add_argument(
    "--fast_parsing",
    action="store_true",
    dest="fast_parsing",
    help="Fast parsing mode - high memory consumption. Default: false")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    default="output",
                    help="Prefix of output files")

args = parser.parse_args()

record_dict = SeqIO.to_dict(SeqIO.parse(
    args.input, format="genbank")) if args.fast_parsing else SeqIO.index_db(
        "temp_index.idx", [args.input], format="genbank")

SequenceRoutines.get_protein_marking_by_exons_from_genbank(
    record_dict,
    args.output_prefix,
    protein_id_field_in_cds_feature="protein_id")

#os.remove("temp_index.idx")
def get_DR_basePair_freq(matF, FastaFile, threshold, factorTranscription,
                         Interdistance_maxValue, matRev):
    # This line retrieves all the sequences from the fasta file
    sequences = SeqIO.to_dict(SeqIO.parse(FastaFile, "fasta"))

    print("  There are %s sequence(s) to analyze" % len(sequences))
    allScoresPos = []
    allScoresNeg = []
    sens = ""
    # We will store in these lists all the occurrences of each kind of interdistance between motifs found in all the sequences.
    DRmotif = []
    DR = [0] * (Interdistance_maxValue + 1)
    ER = [0] * (Interdistance_maxValue + 1)
    IR = [0] * (Interdistance_maxValue + 1)
    index = 0
    #print("dep : ",dep)
    # We look at all the fasta sequences:
    for s in sequences:
        # We will store in this list all the best scores (see the threshold below) found for sub-sequences of one sequence
        #if type(threshold) is list:
        good_score_positions = []
        bestScore = 0
        positionOfTheBestScore = 0
        # This line retrieves the DNA sequence
        seq = sequences[s].seq
        id = sequences[s].id

        # We look at each sub-sequence of the whole sequence. Each sub-sequence has the same length as the matrix.
        for c in range(len(seq) - (lenMotif - 1)):
            strandPos = seq[c:c + lenMotif].upper()
            #print("strandPos : ",strandPos)
            test = 0
            for nu in strandPos:
                if nu not in ["A", "C", "G", "T"]:
                    test = 1
            if test == 1:
                score = "NA"
            else:
                n = 0
                #These lines calculate a score for one sub-sequence
                scoreStrandPos = 0
                scoreStrandNeg = 0
                while n < lenMotif:
                    if strandPos[n] == 'A':
                        scoreStrandPos = scoreStrandPos + matF[n * 4]
                        scoreStrandNeg = scoreStrandNeg + matRev[n * 4]
                    elif strandPos[n] == 'C':
                        scoreStrandPos = scoreStrandPos + matF[n * 4 + 1]
                        scoreStrandNeg = scoreStrandNeg + matRev[n * 4 + 1]
                    elif strandPos[n] == 'G':
                        scoreStrandPos = scoreStrandPos + matF[n * 4 + 2]
                        scoreStrandNeg = scoreStrandNeg + matRev[n * 4 + 2]
                    elif strandPos[n] == 'T':
                        scoreStrandPos = scoreStrandPos + matF[n * 4 + 3]
                        scoreStrandNeg = scoreStrandNeg + matRev[n * 4 + 3]
                    n += 1

                #These lines record the position and the strand of each predicted binding site.
                #You can change the threshold.
                if scoreStrandPos > threshold:
                    #good_score_positions.append([c+1,">",scoreStrandPos,str(strandPos[0:1]),str(strandPos[1:2]),str(strandPos[2:3]),str(strandPos[3:4]),str(strandPos[4:5]),str(strandPos[5:6]),str(strandPos[6:7]),str(strandPos[7:8]),str(strandPos[8:9]),str(strandPos[9:10]),index])
                    good_score_positions.append([
                        c + 1, ">", scoreStrandPos,
                        str(strandPos[3:10]), index
                    ])
                    #good_score_positions.append([c+1,">",scoreStrandPos,str(strandPos[0:10])+str(seq[c+lenMotif:c+lenMotif+1+lenMotif]),index])
                if scoreStrandNeg > threshold:
                    good_score_positions.append([
                        c + 1, "<", scoreStrandNeg,
                        str(strandPos[0:7]), index
                    ])

        # Once we have stored all the positions, we calculate all the interdistances:
        for first in range(0, len(good_score_positions) - 1):
            firstSubSeq = good_score_positions[first]
            for second in range(first + 1, len(good_score_positions)):
                secondSubSeq = good_score_positions[second]
                if factorTranscription == "ARF2":
                    if firstSubSeq[1] == ">" and secondSubSeq[1] == ">":
                        d = (int(secondSubSeq[0]) + 2) - (int(firstSubSeq[0]) +
                                                          lenMotif - 2)
                        if d == 8:
                            DRmotif.append(firstSubSeq)
                if factorTranscription == "ARF5":
                    if firstSubSeq[1] == ">" and secondSubSeq[1] == ">":
                        d = (int(secondSubSeq[0]) + 3) - (int(firstSubSeq[0]) +
                                                          lenMotif - 1)
                        #if d == 5 :
                        #DRmotif.append(firstSubSeq)
                    if firstSubSeq[1] == "<" and secondSubSeq[1] == ">":
                        d = (int(secondSubSeq[0]) + 3) - (int(firstSubSeq[0]) +
                                                          lenMotif - 3)
                        if d == 0:
                            DRmotif.append(
                                str(firstSubSeq[3]) + str(secondSubSeq[3]))
        index = index + 1
    #print("DRmotif : ",DRmotif)
    #DRmotif_without_doublon = [list(i) for i in set(map(tuple, DRmotif))]
    DRbp1 = []
    DRbp2 = []
    DRbp3 = []
    DRbp4 = []
    DRbp5 = []
    DRbp6 = []
    DRbp7 = []
    DRbp8 = []
    DRbp9 = []
    DRbp10 = []
    #for i in DRmotif_without_doublon :
    #DRbp1.append(i[3])
    #DRbp2.append(i[4])
    #DRbp3.append(i[5])
    #DRbp4.append(i[6])
    #DRbp5.append(i[7])
    #DRbp6.append(i[8])
    #DRbp7.append(i[9])
    #DRbp8.append(i[10])
    #DRbp9.append(i[11])
    #DRbp10.append(i[12])
    return (DRbp1, DRbp2, DRbp3, DRbp4, DRbp5, DRbp6, DRbp7, DRbp8, DRbp9,
            DRbp10, DRmotif)
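The inner while-loop above is a position weight matrix (PWM) scan over a matrix flattened row-major as matF[position * 4 + base]. The same scoring step can be written more compactly; a sketch under that flattened-layout assumption:

BASE_INDEX = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def pwm_score(window, flat_matrix):
    # Score one window against a PWM flattened as [position * 4 + base].
    return sum(flat_matrix[i * 4 + BASE_INDEX[nu]]
               for i, nu in enumerate(window))

# pwm_score(strandPos, matF) reproduces scoreStrandPos from the loop above.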
Ejemplo n.º 60
0
    stdout=subprocess.PIPE)
temp2.write(p.communicate()[0])
temp.close()
temp2.flush()
temp2.seek(0)

listToMerge = []
lastLine = []
fusion = False
losgehts = False
if not strandSpecific:
    sys.stderr.write(
        'read fasta records into memory (for strand unspecific protocols only).  \n'
    )
    handle = open(refGenome, "r")
    record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
    handle.close()
sys.stderr.write('process the sorted split reads.  \n')
for line in temp2.readlines():
    columns = line.strip().split('\t')
    if (losgehts):
        if (columns[0] == lastLine[0]):
            listToMerge.append(columns)
        else:
            listofmappings = []
            listofmappings = getmapping(listToMerge)
            for i in range(0, len(listofmappings)):
                transcripts += 1
                if (len(listofmappings[i]) > 2):
                    transcriptsMore += 1
                fusion = isFusion(listofmappings[i])