Esempio n. 1
0
            # Expected to fail
            pass

    # Show the alignment
    for i, alignment in enumerate(alignments):
        if i < 3 or i + 1 == t_count:
            print(" Alignment %i, with %i sequences of length %i" \
                  % (i,
                     len(alignment),
                     alignment.get_alignment_length()))
            print(alignment_summary(alignment))
        elif i == 3:
            print(" ...")

    # Check AlignInfo.SummaryInfo likes the alignment
    summary = AlignInfo.SummaryInfo(alignment)
    dumb_consensus = summary.dumb_consensus()
    #gap_consensus = summary.gap_consensus()
    if t_format != "nexus":
        # Hack for bug 2535
        pssm = summary.pos_specific_score_matrix()
        rep_dict = summary.replacement_dictionary()
        try:
            info_content = summary.information_content()
        except ValueError as e:
            if str(
                    e
            ) != "Error in alphabet: not Nucleotide or Protein, supply expected frequencies":
                raise e
            pass
Esempio n. 2
0
def align_by_phylotype(input_fasta,input_cluster,input_metadata,output_folder):
    """Group sequences by phylotype, align each group with MAFFT and save consensus sequences.

    Parameters:
        input_fasta: path to a FASTA file; records are keyed by ``record.id``.
        input_cluster: path to a text file with one phylotype (dot-separated
            label) per line.
        input_metadata: path to a CSV file with at least the columns
            ``header`` (sequence id) and ``lineage`` (dot-separated label).
        output_folder: string prefix prepended verbatim to every output file
            name, so it is expected to end with a path separator.

    A sequence is assigned to the first phylotype (in cluster-file order)
    whose dot-separated components are a prefix of the sequence's lineage.
    For every phylotype with more than two sequences the group is aligned
    with the external ``mafft`` program and a consensus is derived from the
    alignment. Files written under ``output_folder``:

        align_phylo.log                  unassigned sequences / skipped phylotypes
        lineage_<phylotype>_alignment.fasta   one alignment per aligned phylotype
        lineage_consensus.fasta          all consensus sequences

    Raises:
        subprocess.CalledProcessError: if ``mafft`` exits with a non-zero status.
    """
    # Local import so this block does not depend on the file's import header.
    import subprocess

    with open(input_metadata, "r") as f:
        metadata_dic = {row["header"]: row["lineage"] for row in csv.DictReader(f)}

    seq_dic = {record.id: record.seq for record in SeqIO.parse(input_fasta, 'fasta')}

    phylotype_dic = {}
    with open(input_cluster, "r") as f:
        for line in f:
            phylotype_dic[line.rstrip()] = []

    for cluster, members in phylotype_dic.items():
        # The cluster label does not change in the inner loop: split it once.
        cluster_type = cluster.split(".")
        cluster_length = len(cluster_type)
        for seq_id, phylotype in metadata_dic.items():
            # Sequences already claimed by an earlier phylotype (or absent
            # from the FASTA file) are no longer in seq_dic.
            if seq_id not in seq_dic:
                continue
            # A lineage shorter than the cluster label yields a shorter
            # slice and therefore never compares equal.
            if phylotype.split(".")[:cluster_length] == cluster_type:
                members.append([seq_id, seq_dic.pop(seq_id), phylotype])

    print("Clade","Number of Sequences")
    for key, members in phylotype_dic.items():
        print(key, len(members))

    consensus_dic = {}
    with open(output_folder + "align_phylo.log", "w") as log_file:
        # Whatever is left in seq_dic matched none of the phylotypes.
        for seq in seq_dic:
            # A FASTA record may have no metadata row at all; report it
            # instead of failing with a KeyError.
            lineage = metadata_dic.get(seq, "unknown (not in metadata)")
            log_file.write("Sequence " + seq + " with lineage " + lineage +
            " did not fall into any of the phylotypes stated in the cluster file.\n")

        for key, members in phylotype_dic.items():
            if len(members) <= 2:
                log_file.write("Phylotype " + key + " does not have more than 2 sequences for an alignment to work.\n")
                continue

            outfile_name = output_folder + "lineage_" + key + ".fasta"
            alignment_name = outfile_name[:-6] + "_alignment.fasta"

            records = [SeqRecord(seq, id=seq_id, description="") for seq_id, seq, _ in members]
            with open(outfile_name, "w") as outfile:
                SeqIO.write(records, outfile, "fasta")

            # Run MAFFT without a shell so file names containing spaces or
            # shell metacharacters cannot alter the command; the alignment
            # is captured from MAFFT's standard output.
            try:
                with open(alignment_name, "w") as alignment_file:
                    subprocess.check_call(["mafft", outfile_name], stdout=alignment_file)
            finally:
                # The unaligned FASTA is only a temporary input for MAFFT.
                os.remove(outfile_name)

            alignment = AlignIO.read(alignment_name, 'fasta')
            summary_align = AlignInfo.SummaryInfo(alignment)
            consensus_dic[key + "_consensus"] = summary_align.dumb_consensus(threshold=0.0,ambiguous='N')

    with open(output_folder + "lineage_consensus.fasta", "w") as consensus_file:
        for key, value in consensus_dic.items():
            SeqIO.write(SeqRecord(value, id=key, description=""), consensus_file, "fasta")
Esempio n. 3
0
def buildGSSP( vgene ):
	"""Build mutability/substitution profile rows for one V gene.

	Reads the module-level names masterList, germList, arguments, mask,
	aa_list, prj_tree and muscle. For each requested profile the germline
	sequences plus a set of observed sequences are aligned with MUSCLE,
	columns that are gaps in a germline row are removed, and a
	position-specific score matrix is computed from the result.

	Returns a list of rows, one per profile and alignment position:
	    [ vgene, profile number (1-based), position (1-based),
	      comma-joined germline residue(s),
	      mutability as a "%.5f" string or the string "None",
	      one frequency string per entry of aa_list ]
	An empty list is returned when the gene has fewer sequences than
	arguments["--numSequences"] or is missing from germList.
	"""

	results = []

	if len(masterList[vgene]) < arguments["--numSequences"]:
		print( "Skipping %s, not enough sequences (%d)..." % ( vgene, len(masterList[vgene]) ) )
		return []

	if vgene not in germList:
		print( "Skipping %s, it's not in the germline database..." %vgene )
		return []

	# Take random overlapping subsets to generate multiple profiles
	#  need to add back a sanity check for capping the number of subsets if there's not enough raw data.
	numProfiles = arguments['--profiles']
	if arguments["--profiles"] == 0:
		numProfiles = 1

	success = 0

	# ids of the germline records; built once instead of once per alignment row
	germIDs = set( g.id for g in germList[vgene] )

	for i in range(numProfiles):
		seqs = [] + germList[vgene] #force a copy rather than an alias
		if arguments["--profiles"] == 0:
			#a profile count of 0 means: one profile from all sequences
			seqs += list(masterList[vgene])
		else:
			#get our sequence subset, add the germlines, and write them
			#   to a temporary file for alignment
			seqs += list(numpy.random.choice(masterList[vgene], size=arguments["--numSequences"], replace=False))

		tempFile = "%s/work/mGSSP/%s_profileBuilder" % (prj_tree.home, vgene)
		with open("%s.fa"%tempFile, "w") as temp:
			SeqIO.write(seqs,temp,"fasta")

		muscle_cline = MuscleCommandline(cmd=muscle, input="%s.fa"%tempFile, out="%s.aln"%tempFile)

		#try to speed up the process a little bit for large datasets
		#still going to max out at ~50k seqs per profile (probably)
		muscle_cline.maxiters	= 2
		muscle_cline.diags	= True

		try:
			stdout, stderr = muscle_cline()
		except Exception:
			#a failed alignment skips this profile only; catching Exception
			#   (not a bare except) lets Ctrl-C and sys.exit() through
			print( "Error in alignment #%d for %s (skipping)" % (i+1, vgene) )
			for f in glob.glob("%s.*"%tempFile):
				os.remove(f)
			continue

		alignment = AlignIO.read("%s.aln"%tempFile, "fasta")#"clustal")
		success += 1

		#Input order is not maintained, so we need a little
		#   kludge to find a germline sequences. Use the
		#   first one to remove any insertions from the alignment
		germRow = 0
		for n, rec in enumerate(alignment):
			if rec.id in germIDs:
				germRow = n
				break

		#look for gaps one at a time so we don't get tripped up by shifting indices
		gap = re.search( "-+", str(alignment[germRow].seq) )
		while (gap):
			alignment = alignment[:, 0:gap.start()] + alignment[:, gap.end():]
			gap = re.search( "-+", str(alignment[germRow].seq) )

		#Now we get BioPython to make a PSSM for us. To convert that into
		#    a mutability profile, we will delete the germline residue[s]
		#    at each position (but save what they were)
		germRes = defaultdict(Counter)
		summary_align = AlignInfo.SummaryInfo(alignment)
		pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=['-','X'])

		#get number of datapoints at each position (might be different than the number of sequences in the profile if there are gaps or missing data
		# do this by using sum(pos.values()) after ignoring missing data (previous line) but before dumping germline residues.
		denominator = [ sum(pos.values()) - len(germList[vgene]) for pos in pssm ]

		for germ in germList[vgene]:
			for pos, residue in enumerate(germ):
				if residue == "X":
					continue
				germRes[pos][residue] += 1
				pssm[pos][residue] = 0

		#normalize and save
		for p, pos in enumerate(pssm):
			germAA = ",".join([ x[0] for x in germRes[p].most_common() ])

			#count of non-germline observations left at this position
			total = sum(pos.values())

			#positions before the mask or with too few datapoints get no rate
			if p < mask[vgene] or denominator[p] < arguments["--numSequences"]:
				mutability = "None"
			else:
				mutability = "%.5f" % (total/denominator[p])

			if total > 0:
				freqs = [ "%.5f" % (pos.get(r,0)/total) for r in aa_list ]
			else:
				freqs = [ "0.00" for r in aa_list ]

			results.append( [ vgene, i+1, p+1, germAA, mutability ] + freqs )

		#clean up
		for f in glob.glob("%s.*"%tempFile):
			os.remove(f)

	print( "Successfully built %d/%d profiles for %s using %d sequences!" % ( success, numProfiles, vgene, len(seqs)-len(germList[vgene]) ) )
	return results
def generateIntron2Consensus(alleleFullList, outputDirectory):
    """Write Intron 2 reference FASTA files and per-group ClustalW consensus files.

    Alleles whose featuresInFullSequence contains 'Intron 2' are copied,
    given their in2Sequence as sequence, and written to
    <outputDirectory>/Intron2References/HLA_Intron2.fasta. The copies are
    then grouped (getAlleleGroups + getAlleleGenes); for each group a FASTA
    file is written and, when the group has more than one allele and no
    alignment file exists yet, ClustalW is run and a consensus FASTA is
    derived from the alignment.

    TODO (from the original author): this method does not seem to work
    anymore, in2Sequence is not assigned anywhere; it is unclear whether
    the code is still needed.
    """
    for featureName in ['Intron 2']:
        shortFeatureName = featureName.replace(' ', '')
        referenceDirectory = join(outputDirectory, shortFeatureName + 'References')

        print ('Creating a ' + featureName + ' Reference:' + join(referenceDirectory, 'HLA_Intron2.fasta'))

        # Collect a copy of every allele that carries an Intron 2 feature.
        intron2Alleles = []
        for allele in alleleFullList:
            # TODO fix featuresInFullSequence.  Might work this way.
            if 'Intron 2' not in allele.featuresInFullSequence:
                continue
            alleleCopy = allele.copy()
            # TODO it is undecided whether in2Sequence will stay in use.
            alleleCopy.sequence = allele.in2Sequence
            intron2Alleles.append(alleleCopy)

        # Intron 2 output file, for analysing *just* the intron 2.
        outputIn2FileName = join(join(outputDirectory, 'Intron2References'), 'HLA_Intron2.fasta')
        printFasta(intron2Alleles, outputIn2FileName, False, False, False)

        # Output files and progress information for each allele group.
        print ('Generating output files for each HLA Allele Group')
        combinedAlleleGroups = getAlleleGroups(intron2Alleles) + getAlleleGenes(intron2Alleles)
        groupTotal = str(len(combinedAlleleGroups))

        for position, alleleGroup in enumerate(combinedAlleleGroups, 1):
            print('(' + str(position) + '/' + groupTotal + '): ' + alleleGroup.FileName)

            outputGroupFileName = join(outputDirectory,
                join('Intron2References', alleleGroup.FileName))
            alignmentFileName = outputGroupFileName.replace('.fasta', '.aln')
            consensusFileName = outputGroupFileName.replace('.fasta', '.consensus.fasta')

            # Zero or one allele: nothing to align, the allele is its own consensus.
            if not (len(alleleGroup.Alleles) > 1):
                print ('Only one allele found. Writing to file: ' + outputGroupFileName)
                # Written out twice: once as the group file, once as the consensus.
                printFasta([alleleGroup.Alleles[0]], outputGroupFileName, True, False, False)
                printFasta([alleleGroup.Alleles[0]], consensusFileName, True, False, False)
                continue

            print (str(len(alleleGroup.Alleles)) + ' Alleles Found.  Writing to file: ' + outputGroupFileName)

            # The group file is written without the APD sequence flag.
            printFasta(alleleGroup.Alleles, outputGroupFileName, False, False, False)

            # An existing alignment is never redone (and no consensus is built from it here).
            if os.path.isfile(alignmentFileName):
                print('This alignment file ' + alignmentFileName + ' already exists.  Moving on...')
                continue

            clustalwCommandline = ClustalwCommandline("clustalw", infile=outputGroupFileName, outfile=alignmentFileName)
            print ('Performing  ClustalW Alignment : \n' + str(clustalwCommandline))
            clustalwCommandline()

            # Sanity check: the alignment must exist after running ClustalW.
            if not os.path.isfile(alignmentFileName):
                print ('Cannot find alignment file after completing alignment:' + alignmentFileName)
                continue

            if os.path.isfile(consensusFileName):
                print ('Consensus file ' + consensusFileName + ' already exists.  Moving on...')
                continue

            align = AlignIO.read(alignmentFileName, 'clustal')
            print ('Consensus FileName = ' + consensusFileName)

            summary_align = AlignInfo.SummaryInfo(align)
            dumb_consensus = summary_align.dumb_consensus()
            # Result is not used below; the call is kept from the original.
            gap_consensus = summary_align.gap_consensus()

            # The consensus is wrapped in an HLA_Allele so printFasta can write it.
            consensusAllele = HLA_Allele()
            consensusAllele.APDSequence = str(dumb_consensus)
            consensusAllele.alleleName = os.path.basename(consensusFileName).replace('.fasta', '')
            consensusAllele.outputDirectory = outputDirectory
            printFasta([consensusAllele], consensusFileName, True, False, False)
def closestToConsensus(linIt):
    """Pick one representative sequence record for each lineage in linIt.

    Each lineage is a mapping with a 'name' (used to locate
    <prj_tree.lineage>/<name>.fa) and a 'desc' mapping from sequence id to
    its original description.

    A lineage with a single entry in 'desc' is read straight from its FASTA
    file. Otherwise the lineage is clustered with vsearch, the first cluster
    of the resulting multiple alignment is extracted, and the member with
    the highest summed position-specific score is returned with gaps and
    vsearch annotations removed and its original description restored.

    Returns a list with one sequence record per lineage, in input order.
    """
    results = []
    print("Starting vsearch on a new chunk...")

    for lineage in linIt:
        lineageBase = "%s/%s" % (prj_tree.lineage, lineage['name'])

        #save time on singletons (if they weren't excluded by minSeq)
        if len(lineage['desc']) == 1:
            with open("%s.fa" % lineageBase, "r") as handle:
                results.append(SeqIO.read(handle, 'fasta'))
            continue

        #cluster and rapid align with vsearch; its messages go to the null
        #   device, which is closed again as soon as vsearch is done
        with open(os.devnull, 'w') as FNULL:
            subprocess.call([
                vsearch, "-cluster_size", "%s.fa" % lineageBase,
                "-id", "0.97", "-sizein", "-sizeout",
                "-msaout", "%s_msa.fa" % lineageBase,
                "-clusterout_sort"
            ],
                            stdout=FNULL,
                            stderr=subprocess.STDOUT)

        #extract biggest cluster: everything after the first line up to
        #   (not including) the first line mentioning "consensus"
        with open("%s_msa.fa" % lineageBase, "r") as allClusters, \
                open("%s_msaBiggest.fa" % lineageBase, "w") as biggestOnly:
            next(allClusters)  #first line is discarded
            for line in allClusters:
                if "consensus" in line:
                    break
                biggestOnly.write(line)

        #open the msa
        with open("%s_msaBiggest.fa" % lineageBase, "r") as handle:
            aln = AlignIO.read(handle, "fasta")

        #add derep size to alignment as weighting
        for rec in aln:
            rec.annotations['weight'] = int(
                rec.id.split(";")[1].split("=")[1])

        summary_align = AlignInfo.SummaryInfo(aln)
        pssm = summary_align.pos_specific_score_matrix()

        #score each sequence: sum of the matrix entry for its letter at every column
        scores = dict()
        for record in aln:
            scores[record.id] = sum(pssm[i][l] for i, l in enumerate(record))

        #best-scoring record; on ties the first one in alignment order wins
        best = max(aln, key=lambda rec: scores[rec.id])
        best.seq = best.seq.ungap("-")  #remove gaps
        best.id = best.id.split(";")[0]  #remove vsearch size annotation
        best.id = re.sub(r"^\*", "", best.id)  #get rid of possible annotation from vsearch
        best.description = lineage['desc'][best.id]  #restore original info

        results.append(best)

    return results
Esempio n. 6
0
 def get_consensus(self):
     '''Return the dumb consensus sequence computed from this cluster's alignment (self.msa).'''
     return AlignInfo.SummaryInfo(self.msa).dumb_consensus()
Esempio n. 7
0
def germline_from_imgt(sequence_file, germline_lib, species_name, output_file, option, report, error, fixed_mut):
    """
    Test cases mainly for Mutation Analysis:
    Identical sequences with varying lengths at the 5' end
    >>> germline_from_imgt("Testfiles/alleles/varyinglengths.txt", "Testfiles/alleles/imgt_germlines.fasta", "H**o sapiens", "Testfiles/alleles/varyinglengths_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common mutations: c67g, g88c, t96g, a97c, g103c, t158a, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline:  caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: .........ctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgGtgtctatggtgggtccttcaCtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcaAagtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg

    Identical sequences with one common deletion and an adjacent deletion that is not present in all sequences
    >>> germline_from_imgt("Testfiles/alleles/somedeletions.txt", "Testfiles/alleles/imgt_germlines.fasta", "H**o sapiens", "Testfiles/alleles/somedeletions_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common deletions: 156, 157, 158
     Common mutations: c67g, g88c, t96g, a97c, g103c, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline:  caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: ...gtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgGtgtctatggtgggtccttcaCtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatcXXX---agtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg

    Identical sequences with a common deletion
    >>> germline_from_imgt("Testfiles/alleles/deletions.txt", "Testfiles/alleles/imgt_germlines.fasta", "H**o sapiens", "Testfiles/alleles/deletions_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common deletions: 153, 154, 155, 156, 157, 158
     Common mutations: c67g, g88c, t96g, a97c, g103c, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline:  caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: ...gtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgGtgtctatggtgggtccttcaCtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatc------agtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg

    Test with 5 identical sequences
    >>> germline_from_imgt("Testfiles/alleles/identical.txt", "Testfiles/alleles/imgt_germlines.fasta", "H**o sapiens", "Testfiles/alleles/identical_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common mutations: c67g, g88c, t96g, a97c, g103c, t158a, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline:  caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: ...gtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgGtgtctatggtgggtccttcaCtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcaAagtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg

    Test with two mutations removed from the first sequence
    >>> germline_from_imgt("Testfiles/alleles/two_dropped.txt", "Testfiles/alleles/imgt_germlines.fasta", "H**o sapiens", "Testfiles/alleles/two_dropped_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common mutations: t96g, a97c, g103c, t158a, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline:  caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: ...gtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgXtgtctatggtgggtccttcaXtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcaAagtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg
    """
    for char in option:
        if char not in 'ciofvjx':
            error('unrecognised option: %s.' % char)
            return
        
    try:
        gl = Germlib(species_name, germline_file=germline_lib)
    except:
        report("Error parsing germline library file: " + str(sys.exc_info()[1]))
        return

    consensus_f = []
    consensus_v = []
    consensus_j = []
    mutated_germs = {}
    
    imgt_nt = {}
    try:
        with open(sequence_file, "r") as sequence_handle:
            ln = sequence_handle.readline()
            sep = ("\t" if "\t" in ln else ",")
            sequence_handle.seek(0)
            reader = csv.DictReader(sequence_handle, delimiter=sep)
            for row in reader:
                imgt_nt[row["Sequence ID"]] = row
    
        outrecs = []
        for id, nt_rec in imgt_nt.iteritems():
            try:
                if "JUNCTION" in nt_rec and nt_rec["JUNCTION"] != None and len(nt_rec["JUNCTION"]) > 0:
                    heavychain = len(nt_rec["V-D-J-REGION"]) > 0
        
                    if heavychain:
                        mAb = (nt_rec["V-REGION"], 
                               nt_rec.get("P3'V", ""), 
                               nt_rec.get("N-REGION", ""), 
                               nt_rec.get("N1-REGION", ""), 
                               nt_rec.get("P5'D", ""),
                               nt_rec.get("D-REGION", ""), 
                               nt_rec.get("P3'D", ""), 
                               nt_rec.get("N2-REGION", ""), 
                               nt_rec.get("P5'J", ""), 
                               nt_rec["J-REGION"])
                    else:
                        mAb = (nt_rec["V-REGION"], 
                               nt_rec.get("P3'V", ""), 
                               nt_rec.get("N-REGION", ""), 
                               nt_rec.get("P5'J", ""), 
                               nt_rec["J-REGION"])
        
                    if 'x' in option:
                        report("%s:" % id)
                        report(" | ".join(mAb))
        
                    # Revert the part of the V-gene that extends to the second Cysteine
                    vregion = nt_rec["V-REGION"]
                    vregion_3prime = nt_rec["3'V-REGION"]
                    vgene_name = Germlib.translate_imgt_name(nt_rec["V-GENE and allele"])
        
                    if len(vregion_3prime) > 0 and vregion[0 - len(vregion_3prime):] != vregion_3prime:
                        report("Error: 3'V-REGION sequence not found at 3' end of V-REGION in sequence %s" % id)
                        continue
        
                    # Remove stray nucleotides from the 5' end of the V-region to give us whole codons (we know the 3' end is aligned)
                    vregion_5prime = vregion[:0 - len(vregion_3prime)] if len(vregion_3prime) > 0 else vregion
                    vregion_5prime = (vregion_5prime if len(vregion_5prime) % 3 == 0 else vregion_5prime[(len(vregion_5prime) % 3):])
        
                    try:
                        vgene_frag1, matchstr_frag1 = gl.match_from_aa(vgene_name, vregion_5prime)
        
                        # For the remaining (3') part, use a global alignment. We use the entire V-region so that the 3prime
                        # region, which might be quite small, aligns against the right part of the sequence
                        vgene_frag2, matchstr_frag2 = gl.match(vgene_name, vregion)[0 - len(vregion_3prime):] if len(vregion_3prime) > 0 else ("", "")
                        
                        if fixed_mut > 0:
                            # Merge the two matchstrings. Starting at the 3' end, we pull matchstring off frag2 until we get beyond vgene_frag2 and are
                            # about to pull the first nt of vgene_frag1. Then we pull the rest off vgene_frag1.
                            
                            mlen = 0
                            matchstr = ""
                            for m in matchstr_frag2[::-1]:
                                if m != 'd':
                                    mlen += 1
                                    if mlen > len(vregion_3prime):
                                        break
                                matchstr += m
                                
                            skip = True
                            for m in matchstr_frag1[::-1]:
                                if skip and m != 'd':
                                    skip = False
                                if not skip:
                                    matchstr += m
                            
                            matchstr = matchstr[::-1]
                            
                            # Sanity check 1 - number of nucleotides in match string should match length of v-region
                            mlen = sum((n != 'd') for n in matchstr)
                            if len(vregion_5prime) + len(vregion_3prime) != mlen:
                                report("Error in match string length for sequence %s" % id)
                                
                            # Sanity check 2 - check matchstring is consistent
                            
                            vgene = str(gl.seq(vgene_name).seq)
                            mismatch = False
                            gt = iter(vgene)
                            vt = iter(vregion)
                            for m in matchstr:
                                if m == 'd':
                                    next(gt)
                                elif m == 'i':
                                    next(vt)
                                elif m == 'm':
                                    if next(gt) != next(vt):
                                        mismatch = True
                                else:
                                    if next(gt) == next(vt):
                                        mismatch = True
                                        
                            if mismatch:
                                report("Error in matchstring for sequence %s:\nvgene: %s\nseq  :  %s\nmatch: %s\n" % (id, vgene, vregion, matchstr))
                            else:
                                en = mutated_germs.get(vgene_name, [])
                                en.append((vregion, matchstr))
                                mutated_germs[vgene_name] = en
                                
                        if nt_rec["J-GENE and allele"] != '':
                            jgene_name = Germlib.translate_imgt_name(nt_rec["J-GENE and allele"])
                            jgene_frag, _ = gl.match(jgene_name, nt_rec["J-REGION"])
                        else:
                            jgene_frag = ''
        
                        if heavychain and nt_rec["D-GENE and allele"] != '':
                            dgene_name = Germlib.translate_imgt_name(nt_rec["D-GENE and allele"])
                            dgene_frag, _ = gl.match(dgene_name, nt_rec["D-REGION"])
                        else:
                            dgene_frag = ''
                    except:
                        report("Error processing sequence " + id + ":")        
                        exc_type, exc_value, exc_traceback = sys.exc_info()
                        report(traceback.format_exception(exc_type, exc_value, exc_traceback, 2))
                        continue
        
                    if heavychain:
                        germline = [
                            vgene_frag1 + vgene_frag2, 
                            nt_rec.get("P3'V", ""), 
                            nt_rec.get("N-REGION", ""), 
                            nt_rec.get("N1-REGION", ""), 
                            nt_rec.get("P5'D", ""),
                            dgene_frag, 
                            nt_rec.get("P3'D", ""), 
                            nt_rec.get("N2-REGION", ""), 
                            nt_rec.get("P5'J", ""), 
                            jgene_frag]
                    else:
                        germline = [
                            vgene_frag1 + vgene_frag2, 
                            nt_rec.get("P3'V", ""), 
                            nt_rec.get("N-REGION", ""), 
                            nt_rec.get("P5'J", ""), 
                            jgene_frag]
        
                    jgene_frag = (jgene_frag if len("".join(germline)) % 3 == 0 else jgene_frag[:0-(len("".join(germline)) % 3)])
                    germline[-1] = jgene_frag
        
                    if 'i' in option:                
                        trunc5 = len(vregion) - len(vregion_5prime + vregion_3prime)
                        if heavychain:
                            trunc3 = (len(nt_rec["V-D-J-REGION"]) - trunc5) % 3
                            if trunc3 != 0:
                                outrecs.append(SeqRecord(Seq(nt_rec["V-D-J-REGION"][trunc5:0-trunc3]), id=id, name=id, description=""))
                            else:
                                outrecs.append(SeqRecord(Seq(nt_rec["V-D-J-REGION"][trunc5:]), id=id, name=id, description=""))
                        else:
                            trunc3 = (len(nt_rec["V-J-REGION"]) - trunc5) % 3
                            if trunc3 != 0:
                                outrecs.append(SeqRecord(Seq(nt_rec["V-J-REGION"][trunc5:0-trunc3]), id=id, name=id, description=""))
                            else:
                                outrecs.append(SeqRecord(Seq(nt_rec["V-J-REGION"][trunc5:]), id=id, name=id, description=""))
        
                    if 'f' in option:
                        if 'x' in option:
                            report("Inferred 'full' germline:")
                            report(" | ".join(germline))
                        sr = SeqRecord(Seq("".join(germline)), id=id + "_germ", name=id + "_germ", description="")
                        consensus_f.append(sr)
                        if 'o' in option:
                            outrecs.append(sr)
        
                    def chunks(l, n):
                        """Generate consecutive slices of ``l``, each ``n`` items long.

                        The final slice is shorter when ``len(l)`` is not a
                        multiple of ``n``.
                        """
                        total = len(l)
                        for offset in xrange(0, total, n):
                            stop = offset + n
                            yield l[offset:stop]
        
                    germline = "".join(germline)
                    v_ext = vgene_frag1 + vgene_frag2
        
                    if 'v' in option:
                        g = (v_ext) + '-' * (len(germline) - len(v_ext))
                        germline_v = ""
                        for c in chunks(g, 3):
                            germline_v += c if '-' not in c else '-'*len(c)
            
                        if 'x' in option:
                            report("Inferred germline (v):")
                            report(germline_v)
                        sr = SeqRecord(Seq(germline_v), id=id + "_germ_v", name=id + "_germ_v", description="")
                        consensus_v.append(sr)
                        if 'o' in option:
                            outrecs.append(sr)
        
                    if 'j' in option:
                        if heavychain:
                            g = v_ext + '-' * (
                                len(nt_rec.get("P3'V", "")) + 
                                len(nt_rec.get("N-REGION", "")) + 
                                len(nt_rec.get("N1-REGION", "")) + 
                                len(nt_rec.get("P5'D", ""))) + \
                                dgene_frag + \
                                '-' * (
                                len(nt_rec.get("P3'D", "")) + 
                                len(nt_rec.get("N2-REGION", "")) + 
                                len(nt_rec.get("P5'J", ""))) + \
                                jgene_frag
                        else:
                            g = v_ext + '-' * (len(germline) - len(v_ext) - len(jgene_frag)) + jgene_frag
                            
                        germline_vj = ""
                        for c in chunks(g, 3):
                            germline_vj += c if '-' not in c else '-'*len(c)
            
                        if 'x' in option:
                            report("Inferred germline_vdj:")
                            report(germline_vj)
                        sr = SeqRecord(Seq(germline_vj), id=id + "_germ_vdj", name=id + "_germ_vdj", description="")
                        consensus_j.append(sr)
                        if 'o' in option:
                            outrecs.append(sr)
                else:
                    report("%s: no junction." % id)
            except:
                report("Error processing input record " + id + ":")        
                exc_type, exc_value, exc_traceback = sys.exc_info()
                report(traceback.format_exception(exc_type, exc_value, exc_traceback, 2))
                
        report("Processing successfully completed.")
        
    except:
        report("Error parsing input file: " + str(sys.exc_info()[1]))
        return

                
    if 'c' in option:
        try:
            def checklengths(srs):
                """Report every record whose sequence length differs from the first.

                The first record in ``srs`` sets the reference length; each
                later record with a different ``len(sr.seq)`` is passed to
                ``report`` by id.  Nothing is reported for an empty input.
                """
                records = iter(srs)
                first = next(records, None)
                if first is None:
                    return
                expected = len(first.seq)
                for sr in records:
                    if len(sr.seq) != expected:
                        report("Length error in sequence %s" % sr.id)

            if 'f' in option:
                checklengths(consensus_f)
                summary = AlignInfo.SummaryInfo(MultipleSeqAlignment(consensus_f))
                cd = summary.dumb_consensus(ambiguous="-")
                consensus = ""
                for c in chunks(cd, 3):
                    consensus += c if '-' not in c else '-'*len(c)
                report("'Full' germline consensus:")
                report(str(consensus))
                outrecs.insert(0, SeqRecord(consensus, id="consensus_germ_full", name="consensus_germ_full", description=""))
            if 'v' in option:
                checklengths(consensus_v)
                summary = AlignInfo.SummaryInfo(MultipleSeqAlignment(consensus_v))
                cd = summary.dumb_consensus(ambiguous="-")
                consensus = ""
                for c in chunks(cd, 3):
                    consensus += c if '-' not in c else '-'*len(c)
                report("Germline (v) consensus:")
                report(str(consensus))
                outrecs.insert(0, SeqRecord(consensus, id="consensus_germ_v", name="consensus_germ_v", description=""))
            if 'j' in option:
                checklengths(consensus_j)
                summary = AlignInfo.SummaryInfo(MultipleSeqAlignment(consensus_j))
                cd = summary.dumb_consensus(ambiguous="-")
                consensus = ""
                for c in chunks(cd, 3):
                    consensus += c if '-' not in c else '-'*len(c)
                report("Germline vdj consensus:")
                report(str(consensus))
                outrecs.insert(0, SeqRecord(consensus, id="consensus_germ_vdj", name="consensus_germ_vdj", description=""))
        except:
            report("Error generating consensus: %s - %s" % (sys.exc_info()[0], sys.exc_info()[1]))
            
    if fixed_mut > 0:
        try:
            report("Mutation Analysis, showing mutations, insertions and deletions that are common to all sequences from a given germline.")
            report("This will be reported for all germlines for which there are at least %d sequences in the analysis:" % fixed_mut)
            def m_limits(m):
                """Return ``(start, end)`` limits of match string ``m`` in germline coordinates.

                Leading and trailing deletions (``'d'``) are ignored.  The
                lower limit is the first character that is not ``'d'``; the
                upper limit is the last character that is neither ``'d'`` nor
                ``'i'``.  Both are converted to germline locations by counting
                every character except insertions (``'i'``).

                Raises:
                    ValueError: if ``m`` contains no position usable as a
                        limit (e.g. it is empty or consists only of
                        deletions/insertions).
                """
                mstart = next((i for i, c in enumerate(m) if c != 'd'), None)
                mend = next(
                    (i for i in range(len(m) - 1, -1, -1) if m[i] not in ('d', 'i')),
                    None)
                if mstart is None or mend is None:
                    raise ValueError("match string has no aligned positions: %r" % (m,))

                start = end = None
                loc = 0
                for i, c in enumerate(m):
                    # Two independent tests: the lower and upper limit may be the
                    # same index (a single aligned position), in which case both
                    # values must be recorded.
                    if i == mstart:
                        start = loc
                    if i == mend:
                        end = loc
                    if c != 'i':
                        loc += 1
                return (start, end)
            
            for germline, mg in mutated_germs.iteritems():
                if len(mg) >= fixed_mut:
                    # given that the sequences may have different start and end points, compute
                    # the range over which we have coverage from a sufficient number of sequences
                    germseq = gl.seq(germline).seq
                    coverage = [0] * len(germseq)
                    for seq, matchstr in mg:
                        start, end = m_limits(matchstr)
                        for i in range(start, end+1):
                            coverage[i] += 1
                            
                    range_start = 999
                    range_end = -1
                    
                    for i, val in enumerate(coverage):
                        if val >= fixed_mut: 
                            if range_start > i:
                                range_start = i
                            if range_end < i:
                                range_end = i
                        
                    # matches[loc] holds:
                    # 'u' if this location has not as yet been observed in sequences processed
                    # 'm' if it has been observed to match the germline in sequences processed so far
                    # 'c,g,a,t' if it has been observed to be mutated to that value in sequences processed so far
                    # 'm' if it has been observed to be deleted in sequences processed so far
                    # 'x' if if the results at this location are not consistent between sequences
                    
                    matches = ['u'] * len(germseq)
                    insertions = []
                    range_encountered_start = 999
                    range_encountered_end = -1
                    
                    for seq, matchstr in mg:
                        ins = 0
                        loc = 0
                        inserts = []
                        (start, end) = m_limits(matchstr)
                        start = max(start, range_start)
                        end = min(end, range_end)
                        s = iter(seq)
                        for m in matchstr:
                            if m != 'i':
                                ins = 0
                                
                            if m == 'n':
                                sub = next(s)
                                if loc >= start and loc <= end:
                                    if matches[loc] == 'u':
                                        matches[loc] = sub
                                    elif matches[loc] != sub:
                                        matches[loc] = 'x'
                                loc += 1
                            elif m == 'd':
                                if loc >= start and loc <= end:
                                    if matches[loc] == 'u':
                                        matches[loc] = 'd'
                                    elif matches[loc] != 'd':
                                        matches[loc] = 'x'
                                loc += 1
                            elif m == 'i':
                                if loc >= start and loc <= end:
                                    inserts.append((loc, ins))
                                ins += 1
                                next(s)
                            else:
                                if loc >= start and loc <= end:
                                    if matches[loc] == 'u':
                                        matches[loc] = 'm'
                                    elif matches[loc] != 'm':
                                        matches[loc] = 'x'
                                loc += 1
                                next(s)
                                        
                        # Add a new insertion to the consensus list if we see it in this sequence, and it is outside
                        # the range we've encountered so far. 
                        
                        for loc, ins in inserts:
                            if loc < range_encountered_start or loc > range_encountered_end:
                                insertions.append((loc, ins))
                        
                        # Remove insertions from the consensus list if they are in range of this sequence and were not
                        # observed in it
                        
                        for loc, ins in insertions:
                            if loc >= start and loc <= end:
                                if not (loc, ins) in inserts:
                                    insertions.remove((loc, ins))
                        
                        range_encountered_start = min(range_encountered_start, start)        
                        range_encountered_end = max(range_encountered_end, end)        
                
                    report("%s (%d sequences):" % (germline, len(mg)))
                    deletions = []
                    for loc, m in enumerate(matches):
                        if m == 'd':
                            deletions.append(loc)
                    if len(deletions) > 0:
                        report(" Common deletions: %s" % ', '.join([str(n) for n in sorted(deletions)]))
                    
                    if len(insertions) > 0:
                        report(" Common insertions: %s") % ', '.join(["%d.%d" % (loc, ins) for (loc, ins) in sorted(insertions)])
    
                    mutations = []
                    for loc, m in enumerate(matches):
                        if m in ('c', 'a', 'g', 't'):
                            mutations.append("%s%d%s" % (germseq[loc], loc, m))
                    if len(mutations) > 0:
                         report(" Common mutations: %s" % ', '.join([str(n) for n in mutations]))
                        
                    if len(insertions) + len(deletions) + len(mutations) > 0:
                        r_g = ""
                        gi = iter(germseq)
                        for m in matches:
                            r_g += next(gi) if m != 'i' else '-'
                        report( "germline:  %s" % r_g)
                        r_c = ""
                        gi = iter(germseq)
                        for m in matches:
                            if m == 'm':
                                r_c  += next(gi)
                            elif m == 'd':
                                r_c += '-'
                                next(gi)
                            elif m == 'i':
                                r_c += 'i'
                            elif m == 'u':
                                r_c += '.'
                                next(gi)
                            else:
                                r_c += m.upper()
                                next(gi)
                        report( "consensus: %s" % r_c)
                    else:
                        report(" No common insertions, deletions or mutations compared to gertmline")
                else:                            
                    report("%s (%d sequences) - number of sequences is below analysis threshold." % (germline, len(mg)))
        except:
            report("Error creating mutation report:")        
            exc_type, exc_value, exc_traceback = sys.exc_info()
            report(traceback.format_exception(exc_type, exc_value, exc_traceback, 2))
            
    SeqIO.write(outrecs, output_file, "fasta")
Esempio n. 8
0
def generate_consensus(alignment):
    """Return the ``dumb_consensus`` of ``alignment``.

    The alignment is wrapped in ``AlignInfo.SummaryInfo`` and
    ``dumb_consensus`` is called with ``threshold=0.51`` and
    ``ambiguous='N'``.
    """
    return AlignInfo.SummaryInfo(alignment).dumb_consensus(
        threshold=0.51, ambiguous='N')
Esempio n. 9
0
# Go through the sequences, find the first record whose ID contains PROBEname,
# and record where its first and last A/C/G/T characters sit.
for rec in alignIN:
    ID = rec.id
    Seq = rec.seq
    SEQ = Seq.upper()
    if PROBEname in ID:
        #print(ID)
        # find() returns -1 for a base that does not occur at all; such misses
        # must not win the min(), otherwise a probe lacking any one base would
        # get start == -1 and an empty head region below.
        first_hits = [SEQ.find(base) for base in "ACGT"]
        present = [pos for pos in first_hits if pos >= 0]
        start = min(present) if present else -1
        # rfind() misses are -1 as well, but max() ignores them whenever at
        # least one base is present.
        end = max(SEQ.rfind(base) for base in "ACGT")
        break
# NOTE(review): if no ID contains PROBEname, start/end are not assigned by this
# loop -- confirm whether the rest of the script guards against that.
#print("Probe region- ID:", ID, "start:",start,"end:", end,"\n")

# Make a 50% consensus sequence.
info = AlignInfo.SummaryInfo(alignIN)
consensus = info.dumb_consensus(threshold=.50, consensus_alpha=ambiguous_dna)
#print("Consensus:",consensus,"\n")
# Consensus for the head and tail regions; their lengths are used to get the
# number of gaps that need to be inserted below.
headcon = consensus[:start + 1]
tailcon = consensus[end:]
colsH = len(headcon)
colsT = len(tailcon)

#loop through each column by record, record scores for head and tail regions and add to dictionary
for rec in alignIN:
    Seq = rec.seq
    ID = rec.id
    for colIDX in range(colsL):
        col = alignIN[count:count + 1, colIDX]
        if colIDX < start:
Esempio n. 10
0
    from Bio.SeqRecord import SeqRecord
    os.chdir(sys.argv[1])
    listing = os.listdir(".")
    consensus = {}
    genConsensus = ''
    pssmGen = ''
    consensusThres = 0.7

    #generalAlignment = AlignIO.parse(sys.argv[2],"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
    generalAlignment = AlignIO.parse(sys.argv[2],
                                     "fasta",
                                     alphabet=Gapped(
                                         IUPAC.ExtendedIUPACProtein(), "-"))
    lengthGenAl = 0
    for genAlignment in generalAlignment:
        sumGen = AlignInfo.SummaryInfo(genAlignment)
        genConsensus = sumGen.gap_consensus(consensusThres)
        #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-'])
        pssmGen = sumGen.pos_specific_score_matrix(genConsensus)
        lengthGenAl = len(genAlignment)

    for item in listing:
        if item.endswith(".fas"):
            #alignments = AlignIO.parse(item,"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
            alignments = AlignIO.parse(item,
                                       "fasta",
                                       alphabet=Gapped(
                                           IUPAC.ExtendedIUPACProtein(), "-"))
            for alignment in alignments:
                summ = AlignInfo.SummaryInfo(alignment)
                consensus[item] = summ.gap_consensus(consensusThres)
Esempio n. 11
0
    def Generate_Consensus_conserved_region(self):
        """Build a consensus from the alignment and record its conserved regions.

        Steps, in order:

        1. Read ``para.txt`` and take the text after the last ``':'`` on the
           line containing ``'Minimum length of conserved regions'`` as the
           minimum region size.
        2. Read ``self.alignment_file`` (clustal format) and compute
           ``dumb_consensus`` with the value returned by
           ``self.get_from_keyboard(self.entry1)`` converted to ``float``.
        3. If ``self.consensus_seq_file`` is set, write the consensus there
           in FASTA-like form.
        4. Collect the indices where the consensus character is ``'X'`` and
           append every stretch between consecutive indices that is at least
           the minimum size to ``self.conserved_region_file`` and to the
           class-level ``position_pair`` list.
        5. Insert a status message into ``self.textbox`` depending on whether
           ``self.conserved_region_file`` exists.

        All intermediate values are stored on ``self`` exactly as before.
        Output files are now written through ``with`` blocks so the handles
        are closed even if a write fails.
        """
        # Read the minimum length of the amplicons from the para.txt file.
        with open('para.txt') as para:
            self.alllines = para.readlines()
        for self.para_index, self.para_line in enumerate(self.alllines):
            self.seperate = self.para_line.split(':')
            if 'Minimum length of conserved regions' in self.para_line:
                self.conserved_region_minimum_size = self.seperate[-1]
                # Drop the trailing newline, if any.
                if self.conserved_region_minimum_size[-1] == '\n':
                    self.conserved_region_minimum_size = \
                        self.conserved_region_minimum_size[0:-1]

        # Read in the alignment file generated by the previous step and create
        # a consensus sequence from it.
        self.alignment = AlignIO.read(self.alignment_file, "clustal")
        self.summary_align = AlignInfo.SummaryInfo(self.alignment)
        self.consensus = self.summary_align.dumb_consensus(
            float(self.get_from_keyboard(self.entry1)))
        if self.consensus_seq_file:
            with open(self.consensus_seq_file, "w+") as f:
                f.write('>' + self.get_from_keyboard(self.entry) + '\n' +
                        str(self.consensus))

        self.star_list = []
        # Find the positions of all 'X' characters in the consensus. There may
        # be none, in which case the inner IndexError handler treats the whole
        # sequence as one conserved region.
        try:
            for self.index in range(0, len(self.consensus)):
                if str(self.consensus[self.index]) == 'X':
                    self.star_list.append(self.index)

            try:
                # Always add the first position and the sequence length to the
                # boundary list. Indexing an empty list raises IndexError here.
                if self.star_list[0] != 0:
                    self.star_list = [0] + self.star_list
                if self.star_list[-1] != len(self.consensus):
                    self.star_list.append(len(self.consensus))

                for self.region_start in range(0, len(self.star_list) - 1):
                    self.region_end = self.region_start + 1
                    # NOTE(review): the slice starts one past the boundary
                    # index, so the character at index 0 is excluded even when
                    # 0 was only added above as a boundary -- confirm intended.
                    self.region = self.consensus[
                        self.star_list[self.region_start] +
                        1:self.star_list[self.region_end]]

                    if len(self.region) >= int(
                            self.conserved_region_minimum_size):
                        with open(self.conserved_region_file, 'a+') as f:
                            f.write('Length:' + str(len(self.region)) + '\n' +
                                    str(self.region) + '\n'
                                    'Startposition is: ' +
                                    str(self.star_list[self.region_start]) +
                                    '\n' + 'Endposition is: ' +
                                    str(self.star_list[self.region_end]) + '\n')
                        self.__class__.position_pair.append([
                            self.star_list[self.region_start],
                            self.star_list[self.region_end]
                        ])
            # If no 'X' positions are found then the whole sequence is
            # conserved.
            except IndexError:
                with open(self.conserved_region_file, 'a+') as f:
                    f.write('Length:' + str(len(self.consensus)) + '\n' +
                            str(self.consensus) + '\n'
                            'Startposition is: 1' + '\n' + 'Endposition is ' +
                            str(len(self.consensus)) + '\n')
                self.__class__.position_pair.append([1, len(self.consensus)])
        except TypeError:
            pass

        # Check whether a conserved-region file exists. If not, tell the user
        # to lower the threshold; otherwise tell them where the file is.
        try:
            if not os.path.isfile(self.conserved_region_file):
                self.textbox.insert(
                    INSERT,
                    'No conserved regions found under this threshold for' +
                    self.get_from_keyboard(self.entry) +
                    ', please lower the threshold!!!' + '\n')
            else:
                self.textbox.insert(
                    INSERT, 'You can get your detected conserved regions at' +
                    self.conserved_region_file + '\n')
        except FileNotFoundError:
            pass
Esempio n. 12
0
def create_alignment(filename, verbose=1, outfile='test.aligned.fasta'):
    """Return an alignment of ``filename``, running MAFFT when it is not aligned yet.

    The file is first read as a FASTA alignment.  If that fails, MAFFT is
    run on the file, its standard output is saved to ``filename + '.mafft'``
    and that file is read as the alignment instead.  All sequences are then
    upper-cased.

    Args:
        filename: path of the FASTA input file.
        verbose: when truthy, print progress messages.
        outfile: accepted for interface compatibility; not used by this
            function.

    Returns:
        A tuple ``(allseq, summary, l)``: the list of alignment records, an
        ``AlignInfo.SummaryInfo`` for the alignment, and the alignment length.

    Exits the interpreter via ``sys.exit`` when ``filename`` does not exist.
    """
    jfilename = filename
    if not exists(jfilename):
        sys.exit("No such file: %s" % jfilename)

    # Let's see if the given file isn't already aligned.
    try:
        alignment = AlignIO.read(filename, 'fasta')
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not mistaken for "file is not
        # aligned" and followed by a MAFFT run.
        #
        # Historical note: this step used to call ClustalW with a custom DNA
        # matrix and gap penalties; those settings do not apply to MAFFT.
        cline = [
            'mafft', '--localpair', '--maxiterate', '16', '--inputorder',
            '--preservecase', '--quiet', jfilename
        ]

        if verbose:
            print('-- running this command:', cline)

        stdout = subprocess.Popen(cline, stdout=subprocess.PIPE,
                                  stderr=None).communicate()[0]
        with open(jfilename + '.mafft', 'wb') as handle:
            handle.write(stdout)
        alignment = AlignIO.read(jfilename + '.mafft', 'fasta')
    else:
        if verbose:
            print("File '%s' is already aligned. Skipping alignment step." %
                  filename)

    # Make sure sequences are upper case.
    for rec in alignment:
        rec.seq = rec.seq.upper()

    # Record indices run from 0 to len - 1.
    allseq = list(alignment)
    summary = AlignInfo.SummaryInfo(alignment)
    l = alignment.get_alignment_length()

    return (allseq, summary, l)
Esempio n. 13
0
# Read the consensus file, echo its lines and keep a FASTA-named copy.
with open('consensus', "r") as myConsensus:
    print_consensus = myConsensus.readlines()
print(print_consensus)
shutil.copy2('consensus', 'consensus.fasta')

# Join the sequence lines into one string; a '>' header line resets the
# accumulator, so only the sequence after the last header is kept.
myCons = ""
for myLine in print_consensus:
    if myLine.startswith('>'):
        myCons = ""
    else:
        # Strip only the newline: slicing off the last character would drop a
        # real residue when the final line has no trailing newline.
        myLine = myLine.rstrip('\n')
        myCons = "".join((myCons, myLine))

#5. Calculation of the abundance matrices - biopython module
myMatrixinput = AlignIO.read(myAlignmentOutput, "clustal")
mySummary_align = AlignInfo.SummaryInfo(myMatrixinput)
myFrequency_matrix = mySummary_align.pos_specific_score_matrix(myCons)
myFrequency_matrix_str = str(myFrequency_matrix)

#Abundance matrix in absolute values
print(myFrequency_matrix_str)
with open('abundance_matrix.txt', 'w') as myAbunout:
    myAbunout.write(myFrequency_matrix_str)

#Abundance matrix in percentage
with open('abundance_matrix.txt', 'r') as myAbunout2:
    myLines = myAbunout2.readlines()

# Left open on purpose: this handle is not closed anywhere in this part of
# the script.
myAbunperout = open('abundance_matrix_percentage.txt', 'w')
Esempio n. 14
0
    def createFirstGuessReferenceFromReads(self):
        """Write a rough initial reference sequence and return its file name.

        All reads are loaded from ``self.readInput``.  When there are more
        reads than ``msaReadCount``, that many are picked at random, written
        to ``MSARaw.fasta``, aligned with Clustal Omega, and the alignment's
        ``dumb_consensus`` (threshold 0.5) becomes the reference.  Otherwise
        an empty sequence is used as the reference.

        The reference is written to ``FirstGuessReference.fasta`` inside the
        ``Initial_Reference`` directory under ``self.outputRootDirectory``;
        the directory is created if missing.

        Returns:
            The path of the written reference file, which is also stored in
            ``self.referenceSequenceFileName``.

        Any exception is printed and re-raised.
        """
        #TODO: I should make this a commandline parameter. More = MSA takes longer. Less = worse reference
        msaReadCount = 4

        print ('I choose ' + str(msaReadCount) + ' random reads.'
            + '\nThese are aligned to form a rough initial consensus sequence. Here:'
            + '\n' + join(self.outputRootDirectory,'Initial_Reference')
            + '\nPerforming ClustalO Multiple Sequence Alignment Now...')
        try:
            # Load reads from file.
            parsedReads = list(parse(self.readInput, self.readInputFormat))
            referenceSequence = None

            # Reference directory.
            referenceDirectory = join(self.outputRootDirectory,'Initial_Reference')
            if not isdir(referenceDirectory):
                makedirs(referenceDirectory)

            if (len(parsedReads) > msaReadCount):
                # Select a random subset of reads for the multiple sequence
                # alignment by shuffling the list of read indexes.
                randomIndexes = list(range(0, len(parsedReads)))
                shuffle(randomIndexes)
                rawClustalReads = [parsedReads[randomIndexes[i]]
                                   for i in range(0, msaReadCount)]

                rawClustalReadsFilename = join(referenceDirectory, 'MSARaw.fasta')
                rawClustalReadsFileWriter = createOutputFile(rawClustalReadsFilename)
                write(rawClustalReads, rawClustalReadsFileWriter, 'fasta')
                rawClustalReadsFileWriter.close()

                # Perform the Clustal Omega MSA.
                clustalOAlignmentOutputFileName = join(referenceDirectory, 'clustalOAlignment.fasta')
                clustalOCommandLine = ClustalOmegaCommandline(infile=rawClustalReadsFilename, outfile=clustalOAlignmentOutputFileName, verbose=True, auto=True, force=True, threads=int(self.numberThreads))
                clustalOCommandLine()

                # Calculate consensus.
                # A dumb consensus has lots of ambiguous nucleotides.  We'll polish those out later.
                alignmentType = 'fasta'
                alignmentObject = read(clustalOAlignmentOutputFileName, alignmentType)
                alignmentSummaryInfo = AlignInfo.SummaryInfo(alignmentObject)
                dumbConsensus = alignmentSummaryInfo.dumb_consensus(threshold=.5)

                referenceSequence = SeqRecord(Seq(str(dumbConsensus) , IUPAC.IUPACUnambiguousDNA),
                    id='Initial_Consensus',
                    description='Initial_Consensus')

            else:
                # Not enough reads to build an alignment: fall back to an
                # empty reference sequence instead of raising.
                referenceSequence = SeqRecord(Seq('' , IUPAC.IUPACUnambiguousDNA),
                    id='Initial_Consensus',
                    description='Initial_Consensus')

            # Write reference to file.
            self.referenceSequenceFileName = join(referenceDirectory, 'FirstGuessReference.fasta')
            firstGuessRefFileWriter = createOutputFile(self.referenceSequenceFileName)
            write([referenceSequence], firstGuessRefFileWriter, 'fasta')

            firstGuessRefFileWriter.close()

            # This message used to sit after the return statement and was
            # therefore never printed.
            print ('Done making initial consensus sequence.')

            return self.referenceSequenceFileName

        except Exception:
            print ('Exception encountered in createFirstGuessReferenceFromReads()')
            print (exc_info()[0])
            print (exc_info()[1])
            print (exc_info()[2])
            raise
Esempio n. 15
0
def frequency():
    """Derive consensus, residue frequencies and per-column entropy.

    Reads the Clustal alignment at ``file_ali_out`` and publishes its results
    through module globals: ``consensus``, ``frequency_matrix`` and ``aadict``
    are assigned here, while ``procent_complete`` and
    ``entropy_position_list`` are appended to (they must already exist as
    lists).

    Side effects, all below ``project_dir``:

    * appends the consensus and the residue list to ``info.txt``;
    * rewrites ``matrices/frequencies.txt`` with the printed score matrix;
    * appends a header plus one row per column to ``matrices/procent.csv``;
    * appends a header plus the entropy list to ``matrices/entropy.csv``.

    Progress is also printed to stdout.
    """
    global consensus
    global frequency_matrix
    global aadict
    global procent_complete
    global entropy_position_list

    info_path = project_dir + spacer + "info.txt"
    matrices_dir = project_dir + spacer + "matrices" + spacer

    # Read the alignment and determine the number of sequences.
    alignment = AlignIO.read(file_ali_out, "clustal")
    summary_align = AlignInfo.SummaryInfo(alignment)
    amount_seq = len(alignment)

    for record in alignment:
        print("%s %s" % (record.seq, record.id))

    # Simple consensus with a threshold; unresolved columns are marked "-".
    consensus = summary_align.dumb_consensus(ambiguous="-",
                                             threshold=cons_tresh)
    print("\n" + str(consensus))
    with open(info_path, "a") as info_file:
        info_file.write("\n consensus sequence: \n" + str(consensus))

    # Abundance of every residue at each position of the consensus.
    frequency_matrix = summary_align.pos_specific_score_matrix(consensus)
    frequency_matrix_str = str(frequency_matrix)
    print(frequency_matrix_str)

    print("writing file of frequencies")
    with open(matrices_dir + "frequencies.txt", "w") as frequencies_file:
        frequencies_file.write(frequency_matrix_str)

    # The first line of the printed matrix lists the residue symbols present
    # in the alignment; take it straight from the string instead of reading
    # the file back, and drop the blanks used for column padding.
    header_line = frequency_matrix_str.split("\n", 1)[0]
    aadict = [symbol for symbol in header_line if symbol != " "]
    with open(info_path, "a") as info_file:
        info_file.write("\n Amino Acids found in sequences: \n \t" +
                        str(aadict))
    print(aadict)

    number_aa = len(aadict)
    # Normalisation constant for the entropy. log2(1) is 0, so with a single
    # residue symbol (or none) there is nothing to normalise by and the
    # normalised entropy is reported as 0 instead of dividing by zero.
    max_entropy = math.log(number_aa, 2) if number_aa > 1 else 0.0

    # Header lines of the csv files.
    with open(matrices_dir + "procent.csv", "a") as procent_file:
        procent_file.write(str(aadict) + "\n")
    with open(matrices_dir + "entropy.csv", "a") as entropy_file:
        entropy_file.write(str(aadict) + "\n")

    with open(matrices_dir + "procent.csv", "a") as procent_file:
        for number in range(len(consensus)):
            column = frequency_matrix[number]
            entropy_num = 0
            for AA in aadict:
                entropy_num += column[AA]

            procent_line = []
            entropy_list = []
            for AA in aadict:
                # (1 - count / number of sequences) as a percentage.
                procent_line.append((1 - (column[AA] / amount_seq)) * 100)

                # p for the entropy term; a column whose counts sum to zero
                # contributes nothing rather than raising ZeroDivisionError.
                quotient_entropy = column[AA] / entropy_num if entropy_num else 0
                if quotient_entropy != 0:
                    # log2(0) is undefined, hence the guard above.
                    entropy_list.append(
                        math.fabs(quotient_entropy *
                                  math.log(quotient_entropy, 2)))
                else:
                    entropy_list.append(0.000)

            # Column entropy as a percentage of the maximum, two decimals.
            entropy_position = float(sum(entropy_list))
            if max_entropy:
                entropy_position = entropy_position / max_entropy
            else:
                entropy_position = 0.0
            entropy_position_list.append("%.2f" % (entropy_position * 100))

            # One csv row per alignment column.
            print(procent_line)
            procent_file.write(str(procent_line))
            procent_file.write("\n")
            procent_complete.append(procent_line)

    with open(matrices_dir + "entropy.csv", "a") as entropy_file:
        entropy_file.write(str(entropy_position_list))
    print(entropy_position_list)
    print(procent_complete)
Esempio n. 16
0
        new_align = alignment[:, 1:]
        print "trimming first character"
        if bs > (len(back_str)/3):
            new_align = new_align[:, :-1]
            print "trimming last character"
            flag = 1
    if bs > (len(back_str)/3) and flag < 1:
        new_align = alignment[:, :-1]
        print "trimming last character but not first"
    return new_align

# Read the FASTA alignment, trim it with trim_consensus(), then write the
# gapped, "dumb" and ungapped consensus sequences to
# <argv[1]>_gap_consensus.fasta, <argv[1]>_dumb_consensus.fasta and
# <argv[1]>_ungap_consensus.fasta.
alignment = AlignIO.read(handle, "fasta")

clean_align = trim_consensus(alignment)
print(clean_align)
summary_align = AlignInfo.SummaryInfo(clean_align)

gap_consensus = summary_align.gap_consensus(threshold=0,
                                            ambiguous='N',
                                            consensus_alpha=alphabet,
                                            require_multiple=1)

dumb_consensus = summary_align.dumb_consensus(threshold=0,
                                              ambiguous='N',
                                              consensus_alpha=alphabet,
                                              require_multiple=1)

ungap_consensus = gap_consensus.ungap("-")

# Each file is written inside a ``with`` block so it is flushed and closed
# even if a later write fails (the handles used to be left open).
for consensus_suffix, consensus_seq in (("_gap_consensus", gap_consensus),
                                        ("_dumb_consensus", dumb_consensus),
                                        ("_ungap_consensus", ungap_consensus)):
    consensus_id = sys.argv[1] + consensus_suffix
    with open(consensus_id + ".fasta", "w") as consensus_outfile:
        SeqIO.write(SeqRecord(consensus_seq, id=consensus_id, description=""),
                    consensus_outfile, "fasta")
def get_sumatyInfo(alignment):
    """Wrap ``alignment`` in an ``AlignInfo.SummaryInfo`` and return it."""
    # Object that holds the summarised information about the alignment.
    return AlignInfo.SummaryInfo(alignment)
Esempio n. 18
0
def _collectModifications(consensus, myseq, startingIndex):
    """List the differences between the aligned template and the consensus.

    Walks the alignment column by column. ``ctTemplate`` is the position in
    the (ungapped) template, starting at ``startingIndex``; it only advances
    on columns where the template has a base. Columns whose consensus is 'X'
    are never reported.

    Returns a list of ``[offset, templatePos, code]`` where ``offset`` is
    ``templatePos - startingIndex`` and ``code`` is ``'d'`` (consensus has a
    gap), ``'s_<base>'`` (bases differ) or ``'i_<n>_<base>'`` (template has a
    gap); ``n`` numbers consecutive insertions at one template position.
    """
    ctTemplate = startingIndex
    modiList = []
    for i, consensusChar in enumerate(consensus):
        templateChar = myseq[i]

        if consensusChar != templateChar and consensusChar != 'X':
            offset = ctTemplate - startingIndex
            if consensusChar == '-':
                modiList.append([offset, ctTemplate, 'd'])
            elif templateChar == '-':
                # Continue the numbering when the previous entry is an
                # insertion at this same template position. The template
                # position is field [1] and the code string is field [2];
                # the old check compared field [0] and split field [1] (an
                # int), so the counter never advanced or raised.
                previous = modiList[-1] if modiList else None
                if (previous is not None and previous[1] == ctTemplate
                        and previous[2].startswith('i_')):
                    suffix = str(int(previous[2].split('_')[1]) + 1)
                else:
                    suffix = "0"

                modiList.append([
                    offset, ctTemplate,
                    'i_' + suffix + "_" + str(consensusChar)
                ])
            else:
                modiList.append(
                    [offset, ctTemplate, 's_' + str(consensusChar)])

        if templateChar != '-':
            ctTemplate += 1

    return modiList


def _selectCentralModifications(modiList, endThres):
    """Keep the modifications that fall in, or directly touch, the core window.

    An entry is kept when its offset is strictly between 20 and 80, or when
    it sits at the edge (offset <= 20 or >= 80) and its neighbour on the
    inner side is inside the window and closer than ``endThres``.
    ``modiList`` must be sorted. Returns ``[templatePos, code]`` pairs.
    """
    newModiList = []
    lastIndex = len(modiList) - 1
    for k, (offset, templatePos, code) in enumerate(modiList):
        keep = 20 < offset < 80
        if not keep and k < lastIndex:
            nextOffset = modiList[k + 1][0]
            keep = (offset <= 20 and nextOffset > 20
                    and abs(nextOffset - offset) < endThres)
        if not keep and k > 0:
            prevOffset = modiList[k - 1][0]
            keep = (offset >= 80 and prevOffset < 80
                    and abs(prevOffset - offset) < endThres)
        if keep:
            newModiList.append([templatePos, code])
    return newModiList


def clustalw2MSA(folderName, segList, startingIndex):
    """Align segments with clustalw2 and report where the template differs.

    Writes the first (at most 11) entries of ``segList`` to
    ``folderName + "seq<startingIndex>.fasta"`` through
    ``IORobot.writeSegOut``, runs ``clustalw2`` on that file, reads the
    resulting ``.aln`` and compares the gap consensus (threshold 0) with the
    aligned record whose id is ``'Segkk0'``.

    :param folderName: prefix prepended as-is to the file names.
    :param segList: segments to align.
    :param startingIndex: template position of the first aligned base; also
        used to name the temporary files.
    :return: list of ``[templatePos, code]`` for the modifications in the
        core window, see ``_collectModifications`` for the codes.
    :raises ValueError: if the alignment holds no ``'Segkk0'`` record.
    """
    filename = "seq" + str(startingIndex)
    lenMax = min(11, len(segList))

    IORobot.writeSegOut(segList[0:lenMax], folderName, filename + ".fasta")
    endThres = 2
    cline = ClustalwCommandline("clustalw2",
                                infile=folderName + filename + ".fasta",
                                PWDNAMATRIX="matrix.txt",
                                pwgapopen=0,
                                pwgapext=0,
                                TRANSWEIGHT=0,
                                GAPOPEN=0,
                                GAPEXT=0)
    cline()

    align = AlignIO.read(folderName + filename + ".aln", "clustal")
    consensus = AlignInfo.SummaryInfo(align).gap_consensus(threshold=0)

    # The template is the record named 'Segkk0'; the last match wins.
    myseq = None
    for eachalign in align:
        if eachalign.id == 'Segkk0':
            myseq = eachalign.seq
    if myseq is None:
        raise ValueError("No 'Segkk0' record in " + folderName + filename +
                         ".aln")

    modiList = _collectModifications(consensus, myseq, startingIndex)
    modiList.sort()
    return _selectCentralModifications(modiList, endThres)
Esempio n. 19
0
def create_consensus(in_fasta, in_metadata, index_field, index_column, lineage,
                     out_fasta, log_file):
    """
    Collapses sequences into consensus sequences based on grouping by index column or index field

    :param in_fasta: Fasta file with the sequences to split into groups; one consensus is built per group. (Required)
    :param in_metadata: Matching metadata file (.csv) using the same sequence names as the fasta file. Column names
    are matched case-insensitively. (Required)
    :param index_field: Metadata column holding the trait the sequences are grouped by. (Required)
    :param index_column: Metadata column holding the sequence IDs that match the fasta file (Default: header). (Optional)
    :param lineage: Specific lineages the user wants to split by; pass "" to group by the exact trait value instead.
    All sub-lineages will be collapsed to the closest lineage (e.g. 1.1.2 to 1.1). (Optional)
    :param out_fasta: Output fasta file with consensus sequences of all groups based on trait (Default: consensus.fasta). (Optional)
    :param log_file: Output log file (Default: stdout). (Optional)

    :return: None. Consensus records are written to ``out_fasta``; the program exits via ``sys.exit`` when a
    column is missing or no sequence name matches the metadata.
    :raises subprocess.CalledProcessError: if ``mafft`` exits with a non-zero status.
    """
    import subprocess

    metadata_dic = {}
    phylotype_dic = {}
    seq_dic = {}
    consensus_dic = {}
    output_folder = os.path.dirname(out_fasta)
    log_handle = get_log_handle(log_file, out_fasta)
    log_handle.write("Output folder: %s\n" % output_folder)

    # The header row is lowercased below, so normalise both column names once
    # and use the normalised form for every lookup (index_column used to be
    # looked up verbatim, which raised KeyError for mixed-case input).
    index_field = index_field.lower()
    index_column = index_column.lower()

    with open(in_metadata, "r") as f:
        reader = csv.DictReader(f)
        reader.fieldnames = [name.lower() for name in reader.fieldnames]
        fieldnames = reader.fieldnames
        metadata = list(reader)

    if index_field not in fieldnames or index_column not in fieldnames:
        sys.exit(
            "Column name not in metadata file, please re-check metadata file and reinsert a column name."
        )

    for items in metadata:
        if items[index_column] in metadata_dic:
            print("Duplicate sequences with name: " + items[index_column] +
                  " in metadata file.",
                  file=log_handle)
        else:
            metadata_dic[items[index_column]] = items[index_field]

    for record in SeqIO.parse(in_fasta, 'fasta'):
        seq_dic[record.id] = record.seq

    if not set(metadata_dic) & set(seq_dic):
        sys.exit("No matching sequence name with metadata name. Program Exit")

    # Names of fasta records that were assigned to a group without being
    # removed from seq_dic (only the trait-value branch does that).
    matched_ids = set()

    if lineage != "":
        for clades in lineage:
            phylotype_dic[clades] = []

        trait_order = sorted(phylotype_dic,
                             key=lambda x: re.sub("[^A-Z0-9]", "", x),
                             reverse=True)

        for cluster in trait_order:
            cluster_type = cluster.split(".")
            cluster_length = len(cluster_type)
            for seq_id, phylotype in metadata_dic.items():
                phylo_type = phylotype.split(".")
                if len(phylo_type) < cluster_length:
                    continue
                if phylo_type[:cluster_length] == cluster_type:
                    if seq_id in seq_dic:
                        phylotype_dic[cluster].append(
                            [seq_id, seq_dic[seq_id], phylotype])
                        # Removing the record ensures it only joins the first
                        # cluster (in trait_order) that it matches.
                        del seq_dic[seq_id]
    else:
        for seq, trait in metadata_dic.items():
            if trait == "":
                # Name the column that is empty; the empty value itself used
                # to be interpolated here, leaving a blank in the message.
                print("Sequence " + seq + " have an empty " + index_field +
                      " value.",
                      file=log_handle)
            if seq not in seq_dic:
                print("Sequence " + seq +
                      " does not match metadata sequence name.",
                      file=log_handle)
                continue
            phylotype_dic.setdefault(trait, []).append([seq, seq_dic[seq]])
            matched_ids.add(seq)

    # Report only the records that really stayed unassigned; without the
    # matched_ids check every record was reported in the trait-value branch.
    for seq in seq_dic:
        if seq not in matched_ids:
            log_handle.write("Sequence " + seq +
                             " did not find any matches to metadata file.\n")

    for key, members in phylotype_dic.items():
        # NOTE(review): groups need MORE than two sequences here, while the
        # log message below says "2 or more" -- confirm which one is intended.
        if len(members) > 2:
            print("Trait:" + key + "\t\tTotal Number:" + str(len(members)),
                  file=log_handle)
            # os.path.join supplies the separator that plain concatenation of
            # the folder and the trait name was missing.
            outfile_name = os.path.join(output_folder, key + ".fasta")
            alignment_name = outfile_name[:-6] + "_alignment.fasta"
            try:
                with open(outfile_name, "w") as outfile:
                    for sequences in members:
                        record = SeqRecord(sequences[1],
                                           id=sequences[0],
                                           description="")
                        SeqIO.write(record, outfile, "fasta-2line")
                # Run mafft without a shell so trait names and paths cannot
                # be interpreted as shell syntax; stdout is the alignment.
                with open(alignment_name, "w") as alignment_handle:
                    subprocess.run(["mafft", outfile_name],
                                   stdout=alignment_handle,
                                   check=True)
                alignment = AlignIO.read(alignment_name, 'fasta')
            finally:
                # The per-group fasta and alignment are temporary files.
                for temp_path in (outfile_name, alignment_name):
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
            consensus_name = key + "_consensus"
            summary_align = AlignInfo.SummaryInfo(alignment)
            consensus_seq = summary_align.dumb_consensus(threshold=0.0,
                                                         ambiguous='N')
            consensus_dic[consensus_name] = consensus_seq
        else:
            log_handle.write(
                "Phylotype " + key +
                " does not have 2 or more sequences for an alignment to work.\n"
            )
    close_handle(log_handle)

    out_handle = get_out_handle(out_fasta)
    for key, value in consensus_dic.items():
        record = SeqRecord(value, id=key, description="")
        SeqIO.write(record, out_handle, "fasta-2line")
    close_handle(out_handle)
Esempio n. 20
0
    # print the alignment back out
    print(alignment.format("clustal"))

# Load the first test alignment as gapped, unambiguous DNA.
alignment = AlignIO.read(os.path.join(test_dir, test_names[0]),
                         "clustal",
                         alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))

# test the base alignment stuff
print('all_seqs...')
for seq_record in alignment:
    print('description: %s' % seq_record.description)
    print('seq: %r' % seq_record.seq)
print('length: %i' % alignment.get_alignment_length())

print('Calculating summary information...')
align_info = AlignInfo.SummaryInfo(alignment)
consensus = align_info.dumb_consensus()
assert isinstance(consensus, Seq)
print('consensus: %r' % consensus)

print('Replacement dictionary')
# Build the dictionary once; it used to be recomputed from the whole
# alignment for every key that was printed.
replacement_counts = align_info.replacement_dictionary(['N'])
ks = sorted(replacement_counts)
for key in ks:
    print("%s : %s" % (key, replacement_counts[key]))

print('position specific score matrix.')
print('with a supplied consensus sequence...')
print(align_info.pos_specific_score_matrix(consensus, ['N']))

print('defaulting to a consensus sequence...')
print(align_info.pos_specific_score_matrix(chars_to_ignore=['N']))
Esempio n. 21
0
def _fetch_fasta_with_retry(gi):
    """Return an Entrez handle for one nucleotide record in FASTA text form.

    Retries every 10 seconds until the request succeeds. Only ``Exception``
    subclasses are retried, so Ctrl-C and ``SystemExit`` still stop the loop.
    """
    while True:
        try:
            return Entrez.efetch(db="nucleotide",
                                 rettype="fasta",
                                 retmode="text",
                                 id=gi)
        except Exception:
            print('Error, trying again')
            time.sleep(10)


def _ungapped_identity(align):
    """Return the percentage of gap-free columns whose residues all agree.

    Raises ZeroDivisionError when every column contains a gap.
    """
    length = len(align[0])
    count = 0
    gaps = 0
    for col in range(length):
        column = align[:, col]
        if "-" in column:
            gaps = gaps + 1
        elif column[1:] == column[:-1]:
            # Shifting the column by one compares every residue with its
            # neighbour, i.e. all residues are identical.
            count = count + 1
    return 100 * (count / float(length - gaps))


def _resolve_gene_choices(c):
    """Choose among candidate GIs per gene; ``c`` is the open DB cursor.

    Reads ``multiple_gene_choices.txt`` (tab separated: key, then a printed
    list of GIs). Keys with more than two GIs are resolved by identity to the
    consensus of all candidates, keys with exactly two GIs by pairwise
    identity, tiling and finally ``blast``. Results are appended to
    ``final_GIs.txt`` and ``choose_mult.txt``; overlapping pairs are appended
    to ``these_seqs_overlap_cluster.txt``.
    """
    muscle_cline = MuscleCommandline(clwstrict=True)
    input_dic = {}
    multiple_dic = {}
    two_dic = {}
    finalseqs = set()
    multfinalseqs = []
    unresolved = []

    with open("multiple_gene_choices.txt") as o:
        for line in o:
            input_dic[line.split("\t")[0]] = line.strip().split(
                "\t")[1].replace("[", "").replace("]", "").replace("'", "")
    for i in input_dic:
        GIs_list = input_dic[i].split(", ")
        if len(GIs_list) > 2:
            multiple_dic[i] = GIs_list
        if len(GIs_list) == 2:
            two_dic[i] = GIs_list

    for i in multiple_dic:
        identities = []
        joined_GIs = ",".join(multiple_dic[i])
        handle = Entrez.efetch(db="nucleotide",
                               rettype="fasta",
                               retmode="text",
                               id=joined_GIs)
        seqs = SeqIO.parse(handle, "fasta")
        handle_string = StringIO()
        SeqIO.write(seqs, handle_string, "fasta")
        data = handle_string.getvalue()
        stdout, stderr = muscle_cline(stdin=data)
        align = AlignIO.read(StringIO(stdout), "clustal")
        summary_align = AlignInfo.SummaryInfo(align)
        consensus = summary_align.gap_consensus(threshold=.5, ambiguous='N')
        consensus_record = SeqRecord(consensus, id="Consensus_all")

        # Align every candidate against the consensus of all candidates.
        for m in multiple_dic[i]:
            handle = _fetch_fasta_with_retry(m)
            seqs = SeqIO.read(handle, "fasta")
            handle_string = StringIO()
            SeqIO.write(seqs, handle_string, "fasta")
            SeqIO.write(consensus_record, handle_string, "fasta")
            data = handle_string.getvalue()
            stdout, stderr = muscle_cline(stdin=data)
            align = AlignIO.read(StringIO(stdout), "clustal")
            identities.append(_ungapped_identity(align))

        best_identity = max(identities)
        if identities.count(best_identity) == 1:
            finalseqs.add(multiple_dic[i][identities.index(best_identity)])
        else:
            # Several candidates tie for the best identity: keep them all.
            GI_to_pick = [
                multiple_dic[i][m] for m, x in enumerate(identities)
                if x == best_identity
            ]
            multfinalseqs.append(GI_to_pick)

    # Plain, reverse-complement and complement alignment are tried in this
    # order; the first one that is not below 95 % identity accepts the pair.
    pair_aligners = (alignment_reg, alignment_rev_comp, alignment_comp)

    for i in two_dic:
        list_of_GIs = two_dic[i]
        if any(not (identity_calc(aligner(list_of_GIs)) < 95)
               for aligner in pair_aligners):
            multfinalseqs.append(list_of_GIs)
            continue

        gene_name = '_'.join(i.split('_')[1:])
        idens, start_stop = tiling(list_of_GIs, gene_name)
        if all(m > 70 for m in idens):
            # Merge overlapping (start, stop) ranges; if nothing merged, the
            # sequences cover different regions and both are kept.
            current_start = -1
            current_stop = -1
            result = []
            for start, stop in sorted(start_stop):
                if start > current_stop:
                    result.append((start, stop))
                    current_start, current_stop = start, stop
                else:
                    current_stop = max(current_stop, stop)
                    result[-1] = (current_start, current_stop)
            if len(result) == len(start_stop):
                multfinalseqs.append(list_of_GIs)
            else:
                # Overlapping sequences are written out for hand checking.
                with open('these_seqs_overlap_cluster.txt', 'a') as a:
                    unresolved.append(list_of_GIs)
                    a.write(str(list_of_GIs) + '\n')
        else:
            print("Parsing taxonomy for error sequences")
            hits = blast(i, list_of_GIs, c)
            if hits.count(min(hits)) == 1:
                # Exactly one lowest taxonomy hit: take that sequence.
                # NOTE(review): hit_levels is not defined in this function;
                # it must be a module-level name or this raises NameError
                # (was `hits` intended?) -- confirm.
                finalseqs.add(str(two_dic[i][hit_levels.index(min(hits))]))
            else:
                # Several lowest taxonomy hits: keep both sequences.
                multfinalseqs.append(two_dic[i])

    print("length of resolved = " + str(len(finalseqs)))
    print("length of choose multiple = " + str(len(multfinalseqs)))
    print("length of unresolved = " + str(len(unresolved)))

    with open("final_GIs.txt", "a") as o:
        for m in finalseqs:
            o.write(str(m) + "\n")

    with open("choose_mult.txt", "a") as o:
        for m in [num for pair in multfinalseqs for num in pair]:
            o.write(str(m) + "\n")


def cluster(blastdb, taxdb):
    """Resolve genes with several candidate GIs using the BLAST database.

    Opens ``blastdb`` with sqlite3, attaches ``taxdb`` to it under the schema
    name ``tax`` and runs the selection in ``_resolve_gene_choices``. The
    connection is closed when the selection finishes or fails.

    :param blastdb: path of the sqlite database to open.
    :param taxdb: path of the sqlite database to attach as ``tax``.
    """
    Entrez.email = "*****@*****.**"
    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        # Bind the path instead of splicing it into the statement, so a quote
        # in the path can neither break nor alter the SQL.
        c.execute("ATTACH DATABASE ? AS 'tax'", (taxdb, ))
        _resolve_gene_choices(c)
    finally:
        conn.close()
Esempio n. 22
0
def ClusterFams(dirClust, dCLustID, strOutputFile, dThresh, strMUSCLE):
    """Reduce every family fasta file to one sequence and collect them.

    For each ``*.faa`` in ``<dirClust>/fams``: a file with more than one
    sequence is aligned with the MUSCLE executable ``strMUSCLE`` and replaced
    by its "dumb consensus" (threshold ``dThresh``, ambiguous residue 'X');
    a file with at most one sequence is copied unchanged. The results go to
    ``<dirClust>/fams/centroids`` and are finally written together, each
    record renamed after its file, to ``strOutputFile``.

    ``dCLustID`` is accepted but not used.
    """
    dirFams = os.sep.join([dirClust, "fams"])
    dirCentroids = os.sep.join([dirFams, "centroids"])
    dirUC = os.sep.join([dirFams, "uc"])

    # Create whichever of the working directories are still missing.
    for dirNeeded in (dirClust, dirFams, dirCentroids, dirUC):
        if not os.path.exists(dirNeeded):
            os.makedirs(dirNeeded)

    for fileFasta in glob.glob(dirFams + os.sep + '*.faa'):
        strBaseName = os.path.basename(fileFasta)
        fileClust = dirCentroids + os.sep + strBaseName
        fileAlign = dirFams + os.sep + strBaseName + ".aln"
        strSeqID = strBaseName.replace(".faa", "")

        # Only families with more than one sequence need an alignment.
        iSeqCount = sum(1 for _ in SeqIO.parse(fileFasta, "fasta"))
        if iSeqCount <= 1:
            shutil.copyfile(fileFasta, fileClust)
            continue

        # Call muscle to produce an alignment.
        subprocess.check_call(
            [strMUSCLE, "-in", str(fileFasta), "-out", str(fileAlign)])

        # BioPython's "dumb consensus" yields the consensus sequence
        # (EMBOSS cons was used for this step previously).
        infoAlign = AlignInfo.SummaryInfo(AlignIO.read(str(fileAlign), "fasta"))
        strConsensus = str(
            infoAlign.dumb_consensus(threshold=dThresh, ambiguous='X'))
        SeqIO.write(SeqRecord(Seq(strConsensus), id=strSeqID), str(fileClust),
                    "fasta")

    # Gather all centroid records, each named after the file it came from.
    ageneAllGenes = []
    for fileFasta in glob.glob(dirCentroids + os.sep + '*.faa'):
        strGeneID = os.path.basename(os.path.splitext(fileFasta)[0])
        for gene in SeqIO.parse(fileFasta, "fasta"):
            gene.id = strGeneID
            ageneAllGenes.append(gene)

    SeqIO.write(ageneAllGenes, strOutputFile, "fasta")
Esempio n. 23
0
 def consensus(self):
     """Return a 'consensus' SeqRecord built from ``self.members``.

     The members are wrapped in a multiple sequence alignment and reduced
     with ``dumb_consensus`` at ``self.consensus_threshold``; columns without
     a consensus become 'N'.
     """
     summary = AlignInfo.SummaryInfo(Align.MultipleSeqAlignment(self.members))
     consensus_seq = summary.dumb_consensus(
         ambiguous='N', threshold=self.consensus_threshold)
     return SeqRecord.SeqRecord(consensus_seq, id='consensus', name='consensus')
def performClustalWAlignmentsForGroupwiseReference(outputDirectory, alleleFullList, runAlignments):
    """Align every allele group with ClustalW and write a consensus fasta for it.

    For each group returned by ``getAlleleGroups(alleleFullList)`` whose
    alignment file does not exist yet:

    * a group with several alleles is aligned with ``clustalw`` (only when
      ``runAlignments`` is true) and, unless the consensus file already
      exists, the "dumb consensus" of the alignment is written with
      ``printFasta``;
    * a group with a single allele has that allele's sequence written as the
      consensus.

    Group fasta files are read from ``<outputDirectory>/AlleleGroupsAPD``;
    alignments go to ``ClustalwAlignmentsAPD`` and consensus files to
    ``ClustalwConsensusAPD`` (both created if missing). The file names are
    derived by replacing '/'-delimited path parts, so '/' is assumed as the
    path separator. Progress is printed to stdout.
    """
    print ('Performing clustalW Alignments and finding consensus sequences')

    # Create the output directories for clustalw.
    clustalwOutputDirectory = join(outputDirectory, 'ClustalwAlignmentsAPD')
    clustalwConsensusOutputDirectory = clustalwOutputDirectory.replace('Alignments','Consensus')
    for neededDirectory in (clustalwOutputDirectory, clustalwConsensusOutputDirectory):
        if not os.path.isdir(neededDirectory):
            os.mkdir(neededDirectory)

    alleleGroups = getAlleleGroups(alleleFullList)
    for index, alleleGroup in enumerate(alleleGroups):

        print('(' + str(index + 1) + '/' + str(len(alleleGroups)) + '): HLA-' + alleleGroup.Gene + '*' + alleleGroup.AlleleGroup)

        outputGroupFileName = join(outputDirectory,
            join('AlleleGroupsAPD',alleleGroup.FileName))

        clustalwAlignmentOutputFileName = outputGroupFileName.replace(
            '.fasta','.aln').replace('/AlleleGroupsAPD/','/ClustalwAlignmentsAPD/')

        clustalwConsensusOutputFileName = outputGroupFileName.replace('/AlleleGroupsAPD/','/ClustalwConsensusAPD/')

        # Nothing to do when the alignment was produced by an earlier run.
        if os.path.isfile(clustalwAlignmentOutputFileName):
            print ('Alignment file ' + clustalwAlignmentOutputFileName + ' already exists.  Moving on...')
            continue

        # A single allele needs no alignment: it is its own consensus.
        if not (len(alleleGroup.Alleles) > 1):
            print ('Only one allele found')
            currentAllele = HLA_Allele()
            currentAllele.sequence = alleleGroup.Alleles[0].sequence
            currentAllele.alleleName = os.path.basename(clustalwConsensusOutputFileName).replace('.fasta','')
            printFasta([currentAllele], clustalwConsensusOutputFileName, False, False, False)
            continue

        print (str(len(alleleGroup.Alleles)) + ' Alleles Found.')

        clustalwCommandline = ClustalwCommandline("clustalw", infile=outputGroupFileName, outfile=clustalwAlignmentOutputFileName)
        print ('ClustalW Alignment Commandline:\n' + str(clustalwCommandline))

        if not runAlignments:
            print ('Not running Alignments because you told me not to.')
            continue

        # Perform the alignment.
        clustalwCommandline()

        if not os.path.isfile(clustalwAlignmentOutputFileName):
            print ('Cannot find alignment file after completing alignment:' + clustalwAlignmentOutputFileName)
            continue

        if os.path.isfile(clustalwConsensusOutputFileName):
            print ('Consensus file ' + clustalwConsensusOutputFileName + ' already exists.  Moving on...')
            continue

        # Build the consensus. The dumb consensus is used; a gap consensus
        # would differ only in allowing gaps, and is no longer computed since
        # its result was never used.
        align = AlignIO.read(clustalwAlignmentOutputFileName, 'clustal')
        dumb_consensus = AlignInfo.SummaryInfo(align).dumb_consensus()

        # An HLA_Allele object is used as a carrier so printFasta can write
        # the consensus like any other allele.
        currentAllele = HLA_Allele()
        currentAllele.APDSequence = str(dumb_consensus)
        currentAllele.alleleName = os.path.basename(clustalwConsensusOutputFileName).replace('.fasta','')
        currentAllele.outputDirectory = outputDirectory
        printFasta([currentAllele], clustalwConsensusOutputFileName, True, False, False)
Esempio n. 25
0
def processing(raw_fasta_path, out_dir_path):
    """Deduplicate, align and cluster the sequences of a FASTA file.

    Everything is written below ``out_dir_path`` (created when missing):

    1. The records of ``raw_fasta_path``, filtered through
       ``remove_duplicates``, are written to a FASTA file with the same
       base name.
    2. ``clustalw2`` is run on that file; the resulting ``.aln`` alignment
       is loaded and its aligned sequences are dumped to ``alignment.txt``.
    3. The ``.dnd`` tree is ladderized and drawn to
       ``figure_with_labels.pdf``.
    4. A pairwise matrix is filled with ``distance``, clustered with
       centroid linkage and cut at half of the largest value in the third
       linkage column; the dendrogram and the reordered matrix are saved to
       ``dendogram.png``.
    5. For every cluster, ``clusters/cluster_<id>.fasta`` and
       ``clusters/frequencies_<id>.png`` are written and an entry
       (size, consensus, PSSM, letter counts of the consensus) is appended
       to ``clusters/clusters_meta.txt``.

    Args:
        raw_fasta_path: Path of the input FASTA file.
        out_dir_path: Directory that receives all outputs.

    Returns:
        None.
    """
    if not os.path.exists(out_dir_path):
        logging.info("Making directory {0}".format(out_dir_path))
        os.makedirs(out_dir_path)

    deduplicated_fasta = remove_duplicates(SeqIO.parse(raw_fasta_path,
                                                       "fasta"))
    base = os.path.basename(raw_fasta_path)
    fasta_path = os.path.join(out_dir_path, base)

    logging.info("Writing FASTA in {0}".format(fasta_path))
    SeqIO.write(deduplicated_fasta, fasta_path, "fasta")

    # Multiple sequence alignment
    cline = ClustalwCommandline("clustalw2", infile=fasta_path)
    cline()
    logging.info(cline)

    # Derive the output names from the file stem instead of replacing every
    # ".fasta" substring: that also rewrote directory names containing
    # ".fasta" and left inputs with another extension unchanged.
    # NOTE(review): assumes ClustalW swaps only the final extension of the
    # input name - confirm against the ClustalW version in use.
    output_stem = os.path.splitext(fasta_path)[0]
    clustalw_result_path = output_stem + ".aln"

    alignment_dict = SeqIO.to_dict(
        AlignIO.read(clustalw_result_path, "clustal"))

    # writing alignment table in .txt
    with open(os.path.join(out_dir_path, "alignment.txt"), "w") as fout:
        fout.write("\n".join(
            str(record.seq) for record in alignment_dict.values()))

    # alignment tree drawing
    tree_path = output_stem + ".dnd"
    tree = Phylo.read(tree_path, "newick")
    tree.ladderize()

    # with labels
    Phylo.draw_graphviz(tree, label_func=lambda x: x.name.replace("ID=", ""))
    plt.savefig(os.path.join(
        out_dir_path, "figure_with_labels.pdf"))  # need pygraphviz, pylab

    # Clustering: ids maps a matrix row/column index to a record id.
    ids = dict(enumerate(alignment_dict.keys()))
    distance_matrix = np.zeros([len(ids)] * 2)
    for i, j in itertools.combinations(range(len(ids)), r=2):
        distance_matrix[i][j] = distance_matrix[j][i] = \
            distance(alignment_dict[ids[i]], alignment_dict[ids[j]])

    # Compute and plot dendrogram
    fig = plt.figure()
    axdendro = fig.add_axes([0.09, 0.1, 0.2, 0.8])
    # NOTE(review): linkage() receives the square matrix, which SciPy treats
    # as observation vectors rather than as pairwise distances - confirm this
    # is intended (scipy.spatial.distance.squareform gives the condensed
    # form).
    Y = linkage(distance_matrix, method="centroid")
    cutoff = 0.5 * max(Y[:, 2])
    clusters = fcluster(Y, cutoff, "distance")
    Z = dendrogram(Y, orientation="right", color_threshold=cutoff)
    axdendro.set_yticks([])

    # Plot distance matrix, rows and columns reordered like the dendrogram
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.8])
    index = Z["leaves"]
    distance_matrix = distance_matrix[index, :]
    distance_matrix = distance_matrix[:, index]
    im = axmatrix.matshow(distance_matrix, aspect="auto", origin="lower")
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # Plot colorbar
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.8])
    plt.colorbar(im, cax=axcolor)

    # Save figure
    dendogram_path = os.path.join(out_dir_path, "dendogram.png")
    fig.savefig(dendogram_path)

    fasta_clusters = defaultdict(list)
    for i, cluster in enumerate(clusters):
        fasta_id = ids[i]
        fasta_clusters[cluster].append(alignment_dict[fasta_id])

    # Saving information about clusters
    clusters_dir_path = os.path.join(out_dir_path, "clusters")
    if not os.path.exists(clusters_dir_path):
        os.makedirs(clusters_dir_path)
    clusters_meta_path = os.path.join(clusters_dir_path, "clusters_meta.txt")
    # The context manager guarantees the report is flushed and closed even
    # when a cluster fails half-way through.
    with open(clusters_meta_path, "w") as meta_file:
        for cluster_id, cluster in fasta_clusters.items():
            cluster_path = os.path.join(
                clusters_dir_path, "cluster_{0}.fasta".format(cluster_id))
            SeqIO.write(cluster, cluster_path, "fasta")
            summary_align = AlignInfo.SummaryInfo(
                MultipleSeqAlignment(cluster))
            consensus = summary_align.dumb_consensus()
            pssm = summary_align.pos_specific_score_matrix(
                consensus, chars_to_ignore=['-'])
            frequencies = dict.fromkeys(IUPAC.protein.letters, 0)
            frequencies.update(
                (key, len(list(group)))
                for key, group in itertools.groupby(sorted(consensus)))
            # "X" is only present when the consensus has an ambiguous
            # position, so a plain pop("X") raised KeyError otherwise.
            frequencies.pop("X", None)

            meta_file.write("""Cluster ID: {0}
Cluster size: {1}
Consensus:
{2}

PSSM:
{3}
Frequencies in consensus:
{4}


""".format(cluster_id, len(cluster), textwrap.fill(str(consensus)), pssm,
           pprint.pformat(frequencies)))

            fig = plt.figure()
            pos = np.arange(len(IUPAC.protein.letters))
            width = .5  # gives histogram aspect to the bar diagram

            ax = plt.axes()
            ax.set_xticks(pos + (width / 2))
            ax.set_xticklabels(IUPAC.protein.letters)

            plt.bar(pos,
                    [frequencies[letter] for letter in IUPAC.protein.letters],
                    width,
                    color='r')
            frequencies_path = os.path.join(
                clusters_dir_path, "frequencies_{0}.png".format(cluster_id))
            fig.savefig(frequencies_path)
            # One figure is created per cluster; release it once saved so
            # open figures do not pile up.
            plt.close(fig)
Esempio n. 26
0
    def test_read_write_clustal(self):
        """Test the base alignment stuff."""
        path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln")
        alignment = AlignIO.read(path,
                                 "clustal",
                                 alphabet=Alphabet.generic_dna)
        self.assertEqual(len(alignment), 7)
        seq_record = alignment[0]
        self.assertEqual(seq_record.description,
                         "gi|6273285|gb|AF191659.1|AF191")
        self.assertEqual(
            seq_record.seq,
            Seq("TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGTACCAGA"
                ))
        seq_record = alignment[1]
        self.assertEqual(seq_record.description,
                         "gi|6273284|gb|AF191658.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[2]
        self.assertEqual(seq_record.description,
                         "gi|6273287|gb|AF191661.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[3]
        self.assertEqual(seq_record.description,
                         "gi|6273286|gb|AF191660.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[4]
        self.assertEqual(seq_record.description,
                         "gi|6273290|gb|AF191664.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[5]
        self.assertEqual(seq_record.description,
                         "gi|6273289|gb|AF191663.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTATACCAGA"
        )
        seq_record = alignment[6]
        self.assertEqual(seq_record.description,
                         "gi|6273291|gb|AF191665.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        self.assertEqual(alignment.get_alignment_length(), 156)
        align_info = AlignInfo.SummaryInfo(alignment)
        consensus = align_info.dumb_consensus()
        self.assertIsInstance(consensus, Seq)
        self.assertEqual(
            consensus,
            "TATACATTAAAGXAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTXCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        dictionary = align_info.replacement_dictionary(["N", "-"])
        self.assertEqual(len(dictionary), 16)
        self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1)
        self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1)
        self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1)
        self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1)
        self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1)
        self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1)
        self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1)
        matrix = align_info.pos_specific_score_matrix(consensus, ["N", "-"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
X  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
T  0.0 0.0 0.0 3.0
A  3.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
X  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")

        matrix = align_info.pos_specific_score_matrix(
            chars_to_ignore=["N", "-"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
X  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
T  0.0 0.0 0.0 3.0
A  3.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
X  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")

        second_seq = alignment[1].seq
        matrix = align_info.pos_specific_score_matrix(second_seq, ["N", "-"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
-  0.0 0.0 0.0 3.0
-  3.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")
        value = align_info.information_content(5, 50, chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 88.42, places=2)
        value = align_info.information_content(chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 287.55, places=2)
        e_freq_table = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25}
        value = align_info.information_content(e_freq_table=e_freq_table,
                                               chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 287.55, places=2)
        self.assertEqual(align_info.get_column(1), "AAAAAAA")
        self.assertAlmostEqual(align_info.ic_vector[1], 2.00, places=2)
        self.assertEqual(align_info.get_column(7), "TTTATTT")
        self.assertAlmostEqual(align_info.ic_vector[7], 1.41, places=2)
        handle = StringIO()
        AlignInfo.print_info_content(align_info, fout=handle)
        self.assertEqual(
            handle.getvalue(), """\
0 T 2.000
1 A 2.000
2 T 2.000
3 A 2.000
4 C 2.000
5 A 2.000
6 T 2.000
7 T 1.408
8 A 2.000
9 A 2.000
10 A 2.000
11 G 2.000
12 A 1.015
13 A 2.000
14 G 2.000
15 G 2.000
16 G 2.000
17 G 2.000
18 G 2.000
19 A 2.000
20 T 2.000
21 G 2.000
22 C 2.000
23 G 2.000
24 G 2.000
25 A 2.000
26 T 2.000
27 A 2.000
28 A 2.000
29 A 2.000
30 T 2.000
31 G 2.000
32 G 2.000
33 A 2.000
34 A 2.000
35 A 2.000
36 G 2.000
37 G 2.000
38 C 2.000
39 G 2.000
40 A 2.000
41 A 2.000
42 A 2.000
43 G 2.000
44 A 2.000
45 A 2.000
46 A 2.000
47 G 2.000
48 A 2.000
49 A 2.000
50 T 2.000
51 A 2.000
52 T 2.000
53 A 2.000
54 T 2.000
55 A 2.000
56 - 0.682
57 - 0.682
58 - 0.333
59 - 0.333
60 - -0.115
61 - -0.115
62 - -0.115
63 - -0.115
64 - -0.115
65 - -0.115
66 A 2.000
67 T 2.000
68 A 2.000
69 T 2.000
70 A 2.000
71 T 2.000
72 T 2.000
73 T 2.000
74 C 1.408
75 A 1.408
76 A 2.000
77 A 2.000
78 T 2.000
79 T 2.000
80 T 1.015
81 C 2.000
82 C 2.000
83 T 2.000
84 T 2.000
85 A 2.000
86 T 2.000
87 A 2.000
88 T 2.000
89 A 2.000
90 C 1.137
91 C 2.000
92 C 2.000
93 A 2.000
94 A 2.000
95 A 2.000
96 T 2.000
97 A 2.000
98 T 2.000
99 A 2.000
100 A 2.000
101 A 2.000
102 A 2.000
103 A 2.000
104 T 2.000
105 A 2.000
106 T 2.000
107 C 2.000
108 T 2.000
109 A 2.000
110 A 2.000
111 T 2.000
112 A 2.000
113 A 2.000
114 A 2.000
115 T 2.000
116 T 2.000
117 A 2.000
118 G 2.000
119 A 2.000
120 T 2.000
121 G 2.000
122 A 2.000
123 A 2.000
124 T 2.000
125 A 2.000
126 T 2.000
127 C 2.000
128 A 2.000
129 A 2.000
130 A 2.000
131 G 2.000
132 A 2.000
133 A 2.000
134 T 2.000
135 C 2.000
136 C 1.408
137 A 2.000
138 T 2.000
139 T 2.000
140 G 2.000
141 A 2.000
142 T 2.000
143 T 2.000
144 T 2.000
145 A 2.000
146 G 2.000
147 T 2.000
148 G 1.408
149 T 2.000
150 A 2.000
151 C 2.000
152 C 2.000
153 A 2.000
154 G 2.000
155 A 2.000
""")
Esempio n. 27
0
def divergence(fastain, patient_id, cutoff):
    """Write per-timepoint divergence from the first-timepoint consensus.

    The sequences of ``fastain`` are grouped by timepoint with ``split``.
    The reference is the consensus of the earliest timepoint
    (``dumb_consensus`` with threshold 0.01, upper-cased, ``X`` replaced by
    ``N``).  For every timepoint and every alignment column with at least
    one non-gap observation, a divergence value is recorded:

    * columns whose derived (non-gap, non-consensus) frequency does not
      exceed ``cutoff`` times the column's non-gap total contribute 0;
    * otherwise every sequence that differs from the consensus at that
      column adds its frequency, split into non-synonymous / synonymous
      according to whether the enclosing codon translates differently from
      the consensus codon.

    Each value is divided by the column's non-gap total.  When more than one
    column was recorded for a timepoint, a CSV row (patient, timepoint, then
    mean, median, 5th and 95th percentile of the total, non-synonymous and
    synonymous divergence) is written to the module-level ``csvfile_gag_b``
    or ``csvfile_gp41_b`` handle, depending on whether ``"gag"`` or
    ``"gp41"`` occurs in ``fastain``.

    Args:
        fastain: Path of the aligned FASTA file.  The third ``_``-separated
            field of each record name is parsed as an integer frequency.
        patient_id: Patient label (string), first column of the CSV row.
        cutoff: Minimum derived fraction a column needs to be scored.

    Returns:
        None.  Progress and per-column debug values are printed.
    """
    split_fasta = split(fastain, 1)
    seqs_by_timepoint = split_fasta[0]

    # NOTE(review): the summary lists below (and patient, prot, sampleTimes)
    # are filled but never returned or read in this function - confirm
    # whether a return value or a plotting step is missing.
    mean_divergence = []
    median_divergence = []
    lower_divergence_25 = []
    upper_divergence_75 = []
    lower_divergence_5 = []
    upper_divergence_95 = []
    divergence_std = []

    mean_N_divergence = []
    median_N_divergence = []
    lower_N_divergence_25 = []
    upper_N_divergence_75 = []
    lower_N_divergence_5 = []
    upper_N_divergence_95 = []
    N_divergence_std = []

    mean_S_divergence = []
    median_S_divergence = []
    lower_S_divergence_25 = []
    upper_S_divergence_75 = []
    lower_S_divergence_5 = []
    upper_S_divergence_95 = []
    S_divergence_std = []

    dN = []
    dN_med = []
    dN_lower_25 = []
    dN_upper_75 = []
    dN_lower_5 = []
    dN_upper_95 = []
    dN_std = []

    dS = []
    dS_med = []
    dS_lower_25 = []
    dS_upper_75 = []
    dS_lower_5 = []
    dS_upper_95 = []
    dS_std = []

    patient = [patient_id]

    nonsyn_sites, syn_sites = number_of_N_and_S_sites(fastain, None)

    # sorted() instead of keys() + in-place sort, so this also works when
    # keys() returns a view.
    sorted_timepoints = sorted(seqs_by_timepoint.keys(), key=natural_keys)

    print(sorted_timepoints)
    first_timepoint = AlignIO.MultipleSeqAlignment(
        seqs_by_timepoint[sorted_timepoints[0]])

    consensus = AlignInfo.SummaryInfo(first_timepoint).dumb_consensus(
        threshold=0.01).upper()
    conseq = Seq(str(consensus).replace('X', 'N'))

    prot = "gag" if "gag" in fastain else "gp41"
    sampleTimes = [float(t) for t in sorted_timepoints]

    for timepoint in sorted_timepoints:

        divergence_all = []
        divergence_N = []
        divergence_S = []
        divergence_dN = []
        divergence_dS = []

        seqs_at_t = seqs_by_timepoint[timepoint]
        seq_length = len(seqs_at_t[0].seq)
        # presumably seq_freq[j] is the frequency of seqs_at_t[j] - verify
        # against get_seq_freq.
        seq_freq = get_seq_freq(seqs_at_t)
        seqs_at_t_array = np.asarray(seqs_at_t)

        # Pass 1: per column, the frequency carried by non-consensus letters
        # (derived) and by all non-gap letters (total).
        full_der_freq = []
        total_site_freq = []
        for i in range(seq_length):
            site_a = seqs_at_t_array[:, i]
            # The comparison is done in lower case, so the sequences are
            # expected to be lower case here.
            consensus_base = conseq[i].lower()

            anc_freq = 0
            der_freq = 0
            for j in range(len(seq_freq)):
                if site_a[j] != '-':
                    if consensus_base == site_a[j]:
                        anc_freq += seq_freq[j]
                    else:
                        der_freq += seq_freq[j]

            full_der_freq.append(der_freq)
            total_site_freq.append(der_freq + anc_freq)

        # Pass 2: score the columns that pass the cutoff.
        for i in range(seq_length):
            diff = 0
            diff_N = 0
            diff_S = 0
            count = total_site_freq[i]
            count1 = 0

            # Compare against this column's own total. The previous code
            # used a variable left over from pass 1, i.e. the total of the
            # last column, which differs whenever gap counts differ.
            if full_der_freq[i] > cutoff * count:

                # Codon enclosing column i (frame starts at column 0).
                codon_start = i - (i % 3)
                codon_end = codon_start + 3

                for each in seqs_at_t:
                    parts = str.split(each.name, "_")
                    freq = int(parts[2].strip())

                    seq = Seq(str(each.seq).upper().replace('-', 'N'))

                    if str(conseq[i]) == "N" or str(seq[i]) == "N":
                        continue

                    count1 += freq

                    if conseq[i] == seq[i]:
                        continue

                    consensus_aa = conseq[codon_start:codon_end].translate()
                    current_aa = seq[codon_start:codon_end].translate()

                    # NOTE(review): conseq had every 'X' replaced by 'N'
                    # above, so this check looks unreachable - confirm
                    # whether 'N' was meant.
                    if 'X' in conseq[codon_start:codon_end]:
                        break

                    if str(consensus_aa) != str(current_aa):
                        diff_N += freq
                    else:
                        diff_S += freq

                    diff += freq

            print(count, count1, i, diff, diff_N, diff_S)

            if count > 0:
                divergence_all.append(float(diff) / float(count))
                divergence_N.append(float(diff_N) / float(count))
                divergence_S.append(float(diff_S) / float(count))
                divergence_dN.append(
                    float(diff_N) / float(nonsyn_sites) / float(count))
                divergence_dS.append(
                    float(diff_S) / float(syn_sites) / float(count))

        if len(divergence_all) > 1:
            mean_divergence.append(np.mean(divergence_all))
            median_divergence.append(np.percentile(divergence_all, 50))
            lower_divergence_25.append(np.percentile(divergence_all, 25))
            upper_divergence_75.append(np.percentile(divergence_all, 75))
            lower_divergence_5.append(np.percentile(divergence_all, 5))
            upper_divergence_95.append(np.percentile(divergence_all, 95))
            divergence_std.append(np.std(divergence_all))

            mean_N_divergence.append(np.mean(divergence_N))
            median_N_divergence.append(np.percentile(divergence_N, 50))
            lower_N_divergence_25.append(np.percentile(divergence_N, 25))
            upper_N_divergence_75.append(np.percentile(divergence_N, 75))
            lower_N_divergence_5.append(np.percentile(divergence_N, 5))
            upper_N_divergence_95.append(np.percentile(divergence_N, 95))
            N_divergence_std.append(np.std(divergence_N))

            mean_S_divergence.append(np.mean(divergence_S))
            median_S_divergence.append(np.percentile(divergence_S, 50))
            lower_S_divergence_25.append(np.percentile(divergence_S, 25))
            upper_S_divergence_75.append(np.percentile(divergence_S, 75))
            lower_S_divergence_5.append(np.percentile(divergence_S, 5))
            upper_S_divergence_95.append(np.percentile(divergence_S, 95))
            S_divergence_std.append(np.std(divergence_S))

            dN.append(np.mean(divergence_dN))
            dN_med.append(np.percentile(divergence_dN, 50))
            dN_lower_25.append(np.percentile(divergence_dN, 25))
            dN_upper_75.append(np.percentile(divergence_dN, 75))
            dN_lower_5.append(np.percentile(divergence_dN, 5))
            dN_upper_95.append(np.percentile(divergence_dN, 95))
            dN_std.append(np.std(divergence_dN))

            dS.append(np.mean(divergence_dS))
            dS_med.append(np.percentile(divergence_dS, 50))
            dS_lower_25.append(np.percentile(divergence_dS, 25))
            dS_upper_75.append(np.percentile(divergence_dS, 75))
            dS_lower_5.append(np.percentile(divergence_dS, 5))
            dS_upper_95.append(np.percentile(divergence_dS, 95))
            dS_std.append(np.std(divergence_dS))

            # Both output files share one row layout: mean, median, 5th and
            # 95th percentile for total, non-synonymous and synonymous.
            fields = [patient_id, str(timepoint)]
            for values in (divergence_all, divergence_N, divergence_S):
                fields.append(str(np.mean(values)))
                fields.append(str(np.percentile(values, 50)))
                fields.append(str(np.percentile(values, 5)))
                fields.append(str(np.percentile(values, 95)))
            row = ",".join(fields) + "\n"

            if "gag" in fastain:
                csvfile_gag_b.write(row)
                csvfile_gag_b.flush()
            elif "gp41" in fastain:
                csvfile_gp41_b.write(row)
                # Flushed like the gag file so both stay current on disk.
                csvfile_gp41_b.flush()

        else:
            print("xxx %s %s" % (patient_id, timepoint))

        print("%s %s %s" % (patient_id, timepoint, len(divergence_all)))
Esempio n. 28
0
import glob

# Directory holding the per-UMI-group FASTA files.  sys.argv[1] is the base
# directory; sys.argv[2] and sys.argv[3] are spliced into the folder name.
directory = (sys.argv[1] + '/grouped_reads_passing_cutoff_' + sys.argv[2] +
             '_' + sys.argv[3] + '.fasta.split/')

files_in_direct = glob.glob(directory + "*.fasta")

print('Sequences grouped...')
print("Script has detected " + str(len(files_in_direct)) +
      " files in the directory " + directory)

print('Generating consensus sequences for each UMI group...')

# Every consensus record is appended to this single FASTA file.
consensus_path = sys.argv[1] + '/consensus_' + sys.argv[2] + '_ZIKV_UMI.fasta'

for fasta_path in files_in_direct:
    align = AlignIO.read(fasta_path, "fasta")
    summary_align = AlignInfo.SummaryInfo(align)
    consensus = summary_align.dumb_consensus(threshold=0.5, ambiguous='N')
    str_con = str(consensus)

    # Record ID: the file name's last "_"-separated field, up to its first ".".
    ID = fasta_path.split("/")[-1].split("_")[-1].split(".")[0]

    # Open in append mode inside the loop so that no output file is created
    # when there are no inputs; `with` closes the handle even if a write fails.
    with open(consensus_path, "a") as output_con:
        output_con.write(">" + ID + "\n")
        output_con.write(str_con + "\n")
Esempio n. 29
0
def number_of_N_and_S_sites(fastain, sites):
    """Count non-synonymous and synonymous sites in an alignment's consensus.

    The alignment in ``fastain`` is read as FASTA, reduced to its
    ``dumb_consensus`` (upper-cased) and printed.  For every position in the
    selected range, the codon containing that position is located, each of
    the four bases A/T/G/C is substituted at that position, and the
    translated amino acid is compared with the translation of the unmodified
    codon.  Each position contributes ``syn / 4`` to the synonymous total and
    ``non_syn / 4`` to the non-synonymous total.  Substituting the base that
    is already present counts as a synonymous change.

    Codons are taken in the frame that starts at index 0 of the consensus.
    Codons containing ``"X"`` contribute nothing to either total.

    Args:
        fastain: Path (or anything ``'%s'``-formattable to a path) of the
            FASTA alignment.
        sites: ``None`` to scan the whole consensus, otherwise an indexable
            pair; positions ``sites[0] - 1`` up to but not including
            ``sites[1] - 1`` are scanned.
            NOTE(review): if ``sites`` is meant to be a 1-based inclusive
            range, the last site is currently skipped -- confirm with callers.

    Returns:
        Tuple ``(total_nonsyn, total_syn)``.
    """
    fastaseq = AlignIO.read('%s' % fastain, 'fasta')
    sequence = AlignInfo.SummaryInfo(fastaseq).dumb_consensus().upper()

    print(sequence)

    if sites is None:
        site1 = 0
        site2 = len(sequence)
    else:
        site1 = sites[0] - 1
        site2 = sites[1] - 1

    bases = ('A', 'T', 'G', 'C')
    total_nonsyn = 0
    total_syn = 0

    for i in range(site1, site2):
        # Offset of this site inside its codon (0, 1 or 2).  The original
        # code left this at 0 for third-position sites (`codon_pos == 3` was
        # a comparison, not an assignment), so the wrong base was mutated.
        codon_pos = i % 3
        codon_start = i - codon_pos
        codon = sequence[codon_start:codon_start + 3]

        syn = 0
        non_syn = 0

        # Codons with an ambiguous consensus character are not scored.
        if "X" not in str(codon):
            # Translation of the unmodified codon is the same for all bases.
            aa = str(codon.translate())

            for b in bases:
                codon_string = list(codon)
                codon_string[codon_pos] = b
                new_aa = Seq("".join(codon_string)).translate()

                if str(new_aa) == aa:
                    syn += 1
                else:
                    non_syn += 1

        total_nonsyn += non_syn / 4.0
        total_syn += syn / 4.0

    return total_nonsyn, total_syn
Esempio n. 30
0
          'A' : 0.25,
          'T' : 0.25}

e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ,
                                   IUPAC.unambiguous_dna)

print 'relative information:', align_info.information_content(
                                   e_freq_table = e_freq_table,
                                   chars_to_ignore = ['N'])

print 'Column 1:', align_info.get_column(1)
print 'IC for column 1:', align_info.ic_vector[1]
print 'Column 7:', align_info.get_column(7)
print 'IC for column 7:', align_info.ic_vector[7]
print 'test print_info_content'
AlignInfo.print_info_content(align_info)
print "testing reading and writing fasta format..."

to_parse = os.path.join(os.curdir, 'Quality', 'example.fasta')

alignment = AlignIO.read(open(to_parse), "fasta",
                         alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna))

# test the base alignment stuff
print 'all_seqs...'
for seq_record in alignment:
    print 'description:', seq_record.description
    print 'seq:', repr(seq_record.seq)

print 'length:', alignment.get_alignment_length()
align_info = AlignInfo.SummaryInfo(alignment)
    os.chdir(sys.argv[1])
    listing = os.listdir(".")
    consensus = {}
    genConsensus = ''
    pssmGen = ''
    # this value should be read from the arguments or else use a default
    consensusThres = 0.7
    # sys.argv[2] holds the path to the general alignment
    generalAlignment = AlignIO.parse(sys.argv[2],
                                     "fasta",
                                     alphabet=Gapped(
                                         IUPAC.ExtendedIUPACProtein(), "-"))
    lengthGenAl = 0
    positionsToMask = []
    for genAlignment in generalAlignment:
        sumGen = AlignInfo.SummaryInfo(genAlignment)
        genConsensus = sumGen.gap_consensus(consensusThres)
        for index, residue in enumerate(genConsensus):
            if genConsensus[index] == '-':
                continue
            if genConsensus[index] == 'X':
                continue
            positionsToMask.append(index)
        #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-'])
        pssmGen = sumGen.pos_specific_score_matrix(genConsensus)
        lengthGenAl = len(genAlignment)

    print positionsToMask

    for item in listing:
        if item.endswith(".fas"):