Example #1
File: calc_gcc.py  Project: boxiangliu/rpe
# Assumed preamble (the snippet starts mid-script): pyfaidx for indexed FASTA
# access, Bio.SeqUtils.GC for GC content, and the input paths from sys.argv.
import sys

from Bio.SeqUtils import GC
from pyfaidx import Fasta

genome_file, annotation_file = sys.argv[1], sys.argv[2]  # assumed argument order

genome = Fasta(genome_file)

if sys.argv[3] == 'gene':
    with open(annotation_file, 'r') as f:
        for line in f:
            if "##" in line: continue
            split_line = line.strip().split('\t')
            if split_line[2] == 'gene':
                chrom = split_line[0]
                start = int(split_line[3]) - 1
                end = int(split_line[4])
                gene_id = split_line[8].split(';')[0].replace('gene_id ',
                                                              '').replace(
                                                                  '"', '')
                gcc = GC(genome[chrom][start:end].seq)
                sys.stdout.write("%s\t%s\n" % (gene_id, gcc))

elif sys.argv[3] == 'exon':
    with open(annotation_file, 'r') as f:
        for line in f:
            if "##" in line: continue

            split_line = line.strip().split('\t')
            if split_line[2] == 'gene':
                gene_id = split_line[8].split(';')[0].replace('gene_id ',
                                                              '').replace(
                                                                  '"', '')
                try:
                    gcc = GC(seq)
                    sys.stdout.write("%s\t%s\n" % (gene_id_bak, gcc))
                except NameError:
                    # Assumed continuation (the source is truncated here): the
                    # first gene line has no accumulated exon sequence yet.
                    pass
                gene_id_bak = gene_id
                seq = ''
            elif split_line[2] == 'exon':
                chrom = split_line[0]
                start = int(split_line[3]) - 1
                end = int(split_line[4])
                seq += genome[chrom][start:end].seq
Example #2
 # Assumed opening (the snippet starts mid-script); per the comments at the
 # end, plotting only happens in a Jupyter session:
 if in_jupyter:  # hypothetical flag, not in the source
     plotcreate = True
     plotdata = {}
 else:
     plotcreate = False
 threek_windows = [
     sequence[start:start + 3000]
     for start in range(0, len(sequence), 3000)
 ]
 windowcount = 0
 for threekindex, threek_window in enumerate(threek_windows):
     lowgc_windows_num = 0
     hundred_windows = [
         threek_window[start:start + 100]
         for start in range(0, len(threek_window), 100)
     ]
     for hunindex, window in enumerate(hundred_windows):
         window = str(window).replace("N", "")
         GCperc = GC(window)
         if plotcreate:
             windowID = (threekindex) * 30 + hunindex + 1
             plotdata[windowID] = GCperc
         if len(window) > 20 and GCperc < 32:  # 32% threshold as in the Diner et al. 2017 paper
             lowgc_windows_num += 1
             #print("{} {}".format(windowID, window))
     if lowgc_windows_num > 10:
         # Write the low-GC 3 kb window itself; the source wrote the whole
         # record (rec.seq), which looks unintended.
         result.write(">{}_candidate centromere {}-{}\n{}\n".format(
             rec.name, threekindex * 3000, (threekindex + 1) * 3000,
             threek_window))
 # if plotcreate:  # only runs in jupyter - graphical output
 #     plotvalues = [int(plotdata[i]) for i in range(1, len(plotdata.keys()))]
 #     plt.plot(range(1, len(plotdata.keys())), plotvalues)
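
A hedged sketch of the plotting step the commented-out lines describe (assumes plotdata was filled as above and matplotlib is available):

import matplotlib.pyplot as plt

window_ids = sorted(plotdata)  # the 100 bp window IDs assigned above
plt.plot(window_ids, [plotdata[w] for w in window_ids])
plt.xlabel("100 bp window ID")
plt.ylabel("GC %")
plt.show()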
Example #3
def automated_intergenic_gc_fixer(
        genome_record,
        interval_list,
        gc_content_constraint_obj=GCContentConstraints(),
        changes_ok_in_feature_types=[],
        increase_GC=True):
    """Fixes GC content in the given interval.

    Avoids making changes within any feature annotations and 20 bp upstream
    of CDS, unless feature type is specified in changes_ok_in_feature_types.

    Args:
        genome_record: Mutable SeqRecord.
        interval_list: List of tuples in which we want to fix GC. Note that
            bases just outside this interval may be changed, as they
            contribute to the local GC measure.
        changes_ok_in_feature_types: List of feature types that we want to
            allow changes inside of.
        gc_content_constraint_obj: Provide limits on GC content.
        increase_GC: If True, increase GC. If False, decrease GC.

    Limitations:
        * Only changes bases that are not inside of any feature annotation
          or before CDS gene.
        * Only increases or decreases GC in all given intervals.
    """
    # Strategy: Sample positions in the interval and change them to
    # corresponding purine or pyrimidine. Recalculate interval GC until
    # above threshold.

    assert increase_GC, "Implementation only supports increasing GC right now."

    # Record the original length for a final assertion.
    len_genome_before_fix = len(genome_record)

    # First calculate black-listed positions that cannot be changed. These are
    # positions that are inside of feature annotations, or just upstream of CDS.
    pos_blacklist_set = _calculate_feature_annotation_shadow(
        genome_record, changes_ok_in_feature_types=changes_ok_in_feature_types)

    # We extend each target interval by half of the GC measurement window size
    # in each direction, since we measure the GC centered at each position
    # in the interval.
    half_window = gc_content_constraint_obj.local_window_size // 2

    # R -> R, Y -> Y
    GC_TRANSITION_TABLE = {
        'A': 'G',
        'T': 'C',
        'G': 'A',
        'C': 'T',
    }

    GC_bases = 'GC'

    for interval in interval_list:
        # Extend the interval to include the epsilon on each side. We change
        # positions in this extended_interval, although we still only target
        # raising the GC of windows centered about positions in the unextended
        # interval.
        extended_interval = (interval[0] - half_window,
                             interval[1] + half_window)
        interval_size = interval[1] - interval[0]
        extended_interval_size = extended_interval[1] - extended_interval[0]
        # Assumes the default local_window_size of 100 (== 2 * half_window).
        assert interval_size + 100 == extended_interval_size

        # Extract the sequence in the interval. We'll modify this extracted
        # sequence only, and then put all the parts back together again
        # once we're done.
        before_interval_seq = genome_record.seq[:extended_interval[0]]
        orig_interval_seq = genome_record.seq[
            extended_interval[0]:extended_interval[1]]
        after_interval_seq = genome_record.seq[extended_interval[1]:]

        # Make a copy to manipulate so that we can make a comparison afterward.
        interval_seq = orig_interval_seq[:]
        assert extended_interval_size == len(interval_seq), (
            "Evaluating interval %s: \n %d != %d" %
            (str(interval), extended_interval_size, len(interval_seq)))

        # Perform the fix. March through each position, updating the interval
        # seq as necessary until that position is above the threshold.
        for interval_pos in range(extended_interval_size):
            # We only care about increasing GC of windows centered at positions
            # inside the original interval, not in the extended interval.
            if (interval_pos < half_window
                    or interval_pos >= extended_interval_size - half_window):
                continue
            # Calculate window coordinates in frame of interval_seq.
            window_start = interval_pos - half_window
            window_end = interval_pos + half_window
            window_seq = interval_seq[window_start:window_end]

            # Figure out which positions we can modify (a list, since we pop below).
            pos_to_modify_list = [
                p for p in range(window_start, window_end)
                if _interval_pos_to_global_pos(p, extended_interval)
                not in pos_blacklist_set
            ]

            # Repeat while we have positions to modify, or until we achieve
            # the desired GC content.
            gc_content = GC(window_seq) / 100
            while (len(pos_to_modify_list) and gc_content <
                   gc_content_constraint_obj.local_window_lower_bound):
                # Choose the next position randomly to avoid bias.
                pos_to_modify_idx = random.randint(0,
                                                   len(pos_to_modify_list) - 1)
                pos_to_modify = pos_to_modify_list.pop(pos_to_modify_idx)
                current = interval_seq[pos_to_modify]
                if current.upper() in GC_bases:
                    continue
                new = GC_TRANSITION_TABLE[current]
                interval_seq = (interval_seq[:pos_to_modify] + new +
                                interval_seq[pos_to_modify + 1:])
                window_seq = interval_seq[window_start:window_end]
                gc_content = GC(window_seq) / 100

        # Put it back together.
        assert len(orig_interval_seq) == len(interval_seq), (
            "len before: %d | len after: %d" %
            (len(orig_interval_seq), len(interval_seq)))
        genome_record.seq = (before_interval_seq + interval_seq +
                             after_interval_seq)

        # Post-completion checks.
        assert len_genome_before_fix == len(genome_record), (
            "len before: %d | len after: %d" %
            (len_genome_before_fix, len(genome_record)))
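
A hedged usage sketch for the fixer above (assumes a mutable SeqRecord genome_record and the module's helpers are in scope; the interval coordinates and feature type are hypothetical):

automated_intergenic_gc_fixer(
    genome_record,
    interval_list=[(1000, 1500), (5200, 5800)],
    changes_ok_in_feature_types=['misc_feature'],
    increase_GC=True)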
Example #4
 def gc(self, seq):
     """Calculate GC content in percent (0-100)."""
     return GC(seq)
Example #5
# Assumed preamble (the snippet starts mid-loop): parse the records and
# initialize the running extremes; the input file name is hypothetical.
from Bio import SeqIO
from Bio.SeqUtils import GC

seq_records = list(SeqIO.parse('sequences.fasta', 'fasta'))
max_len, min_len = 0, float('inf')
longest_seq = shortest_seq = None
for i in range(len(seq_records)):
    if len(seq_records[i].seq) > max_len:
        # update max_len and longest_seq
        max_len = len(seq_records[i].seq)
        longest_seq = seq_records[i].id
    elif len(seq_records[i].seq) < min_len:
        # update min_len and shortest_seq
        min_len = len(seq_records[i].seq)
        shortest_seq = seq_records[i].id

print('Longest sequence is', longest_seq, 'with length', max_len, 'bp')
print('Shortest sequence is', shortest_seq, 'with length', min_len, 'bp')

# Create a new list containing only the sequences longer than 500 bp,
# calculate the average length of these sequences,
# and calculate and print the GC content of each.

long_seq_records = list()  # empty list for sequences

total_seq_length = 0
for sequence in seq_records:
    if len(sequence) > 500:
        long_seq_records.append(sequence)
        total_seq_length += len(sequence)
        gc = GC(sequence.seq)
        print('%GC in', sequence.id, 'is {:.2f}'.format(gc))

avg_seq_length = total_seq_length / len(long_seq_records)

print('Average length for sequences longer than 500bp is', avg_seq_length)

# Write the sequences in long_seq_records to a file in FASTA format
SeqIO.write(long_seq_records, 'long_sequences.fa', 'fasta')
Example #6
def divergence():

    #####################
    ## Input arguments ##
    #####################
    fic1dna = sys.argv[1]  # DNA sequence file, species 1
    fic2dna = sys.argv[2]  # DNA sequence file, species 2
    fic1prot = sys.argv[3]  # protein sequence file, species 1
    fic2prot = sys.argv[4]  # protein sequence file, species 2

    #outfile_unaligned="outfile_unaligned.fa"
    #outfile_unaligned=open(outfile_unaligned,"w",encoding='utf-8')
    outfile_dn_ds = sys.argv[5]  # output file, table format, sep = ";"
    outfile_dn_ds = open(outfile_dn_ds, "w", encoding='utf-8')
    method = sys.argv[6]  # method to use
    muscle_exe = sys.argv[7]  # path to the MUSCLE executable

    # Parse the sequences with SeqIO
    seq1dna = list(
        SeqIO.parse(fic1dna, "fasta", alphabet=IUPAC.IUPACUnambiguousDNA()))
    seq2dna = list(
        SeqIO.parse(fic2dna, "fasta", alphabet=IUPAC.IUPACUnambiguousDNA()))
    seq1prot = list(SeqIO.parse(fic1prot, "fasta", alphabet=IUPAC.protein))
    seq2prot = list(SeqIO.parse(fic2prot, "fasta", alphabet=IUPAC.protein))

    # First line of the table (column headers)
    """print("seq.id",";","dN",";","dS",";","Dist_third_pos",";","Dist_brute",";","Length_seq_1",";","Length_seq2",
		";","GC_content_seq1",";","GC_content_seq2",";","GC",";","Mean_length",file=outfile_dn_ds)"""

    print("Nombre de paires de sequences a analyser: ", len(seq1dna))

    print("seq.id", ";", "dN", ";", "dS", ";", "Dist_third_pos", ";",
          "Dist_brute", ";", "Length_seq_1", ";", "Length_seq2", ";",
          "GC_content_seq1", ";", "GC_content_seq2", ";", "GC", ";",
          "Mean_length")
    """df2 = pd.DataFrame(columns=("seq.id","dN","dS","Dist_third_pos","Dist_brute","Length_seq_1","Length_seq2",
		"GC_content_seq1","GC_content_seq2","GC","Mean_length"))"""

    # Loop over each pair of sequences
    u = 0
    while u < len(seq1dna):

        try:

            ###########################################################
            #         Alignment of each pair of sequences             #
            ###########################################################

            nuc1 = str(seq1dna[u].seq)  # sequence u as a plain string
            nuc2 = str(seq2dna[u].seq)
            prot1 = str(seq1prot[u].seq)
            prot2 = str(seq2prot[u].seq)

            # Wrap the protein sequences as SeqRecord objects
            protein2 = SeqRecord(Seq(prot2, alphabet=IUPAC.protein),
                                 id='protein2')
            protein1 = SeqRecord(Seq(prot1, alphabet=IUPAC.protein),
                                 id='protein1')

            # Write the two unaligned sequences to a FASTA file
            with open("outfile_unaligned.fa", "w",
                      encoding='utf-8') as output_handle:
                SeqIO.write(protein1, output_handle, "fasta")
                SeqIO.write(protein2, output_handle, "fasta")

            # MUSCLE takes the unaligned FASTA and writes an aligned file
            muscle_cline = MuscleCommandline(muscle_exe,
                                             input="outfile_unaligned.fa",
                                             out="outfile_aligned.aln")
            stdout, stderr = muscle_cline()
            # Read the aligned sequences back
            alns = AlignIO.read("outfile_aligned.aln", "fasta")

            prot1 = str(alns[0].seq)  # aligned protein sequence 1
            prot2 = str(alns[1].seq)  # aligned protein sequence 2

            # Wrap the nucleotide sequences as SeqRecord objects
            nuc2 = SeqRecord(Seq(nuc2, alphabet=IUPAC.IUPACUnambiguousDNA()),
                             id='nuc2')
            nuc1 = SeqRecord(Seq(nuc1, alphabet=IUPAC.IUPACUnambiguousDNA()),
                             id='nuc1')

            # Wrap the aligned protein sequences as SeqRecord objects
            prot1 = SeqRecord(Seq(prot1, alphabet=IUPAC.protein), id='pro1')
            prot2 = SeqRecord(Seq(prot2, alphabet=IUPAC.protein), id='pro2')

            # Build an alignment object from the two aligned proteins
            aln = MultipleSeqAlignment([prot1, prot2])

            # Build the codon alignment
            codon_aln = codonalign.build(aln, [nuc1, nuc2])

            # Alignment file
            #AlignIO.write(codon_aln,"outfile_aligned", 'fasta')

            lengthseq1 = len(nuc1.seq)
            lengthseq2 = len(nuc2.seq)
            GCcontentseq1 = GC(nuc1.seq)
            GCcontentseq2 = GC(nuc2.seq)

            GC_mean = (GCcontentseq1 + GCcontentseq2) / 2

            if lengthseq1 >= lengthseq2:
                Min_length = lengthseq2
            else:
                Min_length = lengthseq1

            ##########################################################
            #             DIVERGENCE METRIC CALCULATIONS             #
            ##########################################################

            # Synonymous and non-synonymous divergence

            # Remove gapped positions
            seq1 = ""
            seq2 = ""
            for x, z in zip(codon_aln[0], codon_aln[1]):
                if z == "-" or x == "-":
                    continue
                seq1 += x
                seq2 += z

            #################################################################
            #             Count of raw polymorphic sites                    #
            #################################################################

            # Count differences per site
            compteur0 = 0
            for i, e in zip(seq1, seq2):
                if i != e:
                    compteur0 += 1

            distance_brute = round(compteur0 / len(seq1), 3)

            seq1_third_pos = ""
            seq2_third_pos = ""

            compteur1 = 0
            for i in seq1[2::3]:
                if i.isalpha():
                    seq1_third_pos += i
                    compteur1 += 1

            compteur2 = 0
            for i in seq2[2::3]:
                if i.isalpha():
                    seq2_third_pos += i
                    compteur2 += 1

            ####################################################################
            #        Count polymorphic sites at third codon positions          #
            ####################################################################

            # Count per-site differences (third positions only)
            compteur3 = 0
            for i, e in zip(seq1_third_pos, seq2_third_pos):
                if i != e:
                    compteur3 += 1

            distance_third_pos = round(compteur3 / compteur2, 3)

            ####################################################################
            #            Compute dN and dS with the chosen method              #
            ####################################################################

            try:

                dN, dS = cal_dn_ds(codon_aln[0], codon_aln[1], method=method)
                """print(seq1dna[u].id,";",dN,";",dS,";",distance_third_pos,";",distance_brute,";",lengthseq1,
					";",lengthseq2,";",GCcontentseq1,";",GCcontentseq2,";",GC_mean,";",Min_length,file=outfile_dn_ds)"""
                print(seq1dna[u].id, ";", dN, ";", dS, ";", distance_third_pos,
                      ";", distance_brute, ";", lengthseq1, ";", lengthseq2,
                      ";", GCcontentseq1, ";", GCcontentseq2, ";", GC_mean,
                      ";", Min_length)
                """df2=df2.append({"seq.id":seq1dna[u].id,"dN":dN,"dS":dS,"Dist_third_pos":distance_third_pos,"Dist_brute":distance_brute,"Length_seq_1":lengthseq1,
		"Length_seq2":lengthseq2,"GC_content_seq1":GCcontentseq1,"GC_content_seq2":GCcontentseq2,"GC":GC_mean,"Mean_length":Min_length}, ignore_index=True)"""

            except (ValueError, ZeroDivisionError, KeyError):
                result = 9.999  # saturation too high to compute the indices
                print(seq1dna[u].id, ";", result, ";", result, ";",
                      distance_third_pos, ";", distance_brute, ";", lengthseq1,
                      ";", lengthseq2, ";", GCcontentseq1, ";", GCcontentseq2,
                      ";", GC_mean, ";", Min_length)
                """df2=df2.append({"seq.id":seq1dna[u].id,"dN":result,"dS":result,"Dist_third_pos":distance_third_pos,"Dist_brute":distance_brute,"Length_seq_1":lengthseq1,
		"Length_seq2":lengthseq2,"GC_content_seq1":GCcontentseq1,"GC_content_seq2":GCcontentseq2,"GC":GC_mean,"Mean_length":Min_length}, ignore_index=True)"""

            u += 1

        except Exception:
            traceback.print_exc()
            print("An error occurred for the sequence pair: ", seq1dna[u].id,
                  "vs", seq2dna[u].id)
            """df2=df2.append({"seq.id":seq1dna[u].id,"dN":"NA","dS":"NA","Dist_third_pos":"NA","Dist_brute":"NA","Length_seq_1":"NA",
		"Length_seq2":"NA","GC_content_seq1":"NA","GC_content_seq2":"NA","GC":"NA","Mean_length":"NA"}, ignore_index=True)"""

            u += 1

    #df2.to_csv(outfile_dn_ds, sep='\t')
    outfile_dn_ds.close()  # close the output file
Example #7
File: GC.py  Project: kohlkopf/RosalindPrep
print "Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each). Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on absolute error below.\n"

from Bio import SeqIO
from Bio.SeqUtils import GC

id_gc = {}
for f in SeqIO.parse('input.fasta', 'fasta'):
    id_gc[f.id] = GC(f.seq)

best_id = max(id_gc, key=id_gc.get)
print(best_id)
print(id_gc[best_id])
Example #8
def findBiasTrend(pairedEndMode, countFileLines, segmentFile):

    # Assumed: the enclosing module imports Bio.SeqIO, Bio.SeqUtils.GC, math,
    # and matplotlib.pyplot as plt. countFileLineEnd is never defined in the
    # source; assume it means every line.
    countFileLineEnd = len(countFileLines)

    segmentGCDict = dict()
    segmentHexamerDict = dict()
    segmentStrandDict = dict()
    segmentPositionDict = dict()

    for record in SeqIO.parse(segmentFile, "fasta"):
        segmentGCDict[record.id] = GC(record.seq)
        segmentHexamerDict[record.id] = dict()
        segmentStrandDict[record.id] = record.description.split(" ")[1].split(
            ":")[5]
        for i in range(len(record.seq) - 5):  # every full hexamer start position
            hexamer = record.seq[i:i + 6]
            if hexamer in segmentHexamerDict[record.id]:
                segmentHexamerDict[record.id][hexamer] += 1
            else:
                segmentHexamerDict[record.id][hexamer] = 1

    segmentIDs = []
    segmentCountsDict = dict()
    segmentCounts = []
    segmentLengths = []
    segmentIsoforms = dict()
    geneIsoforms = []
    segmentPositionsDict = dict()

    countFileLineIndex = 0
    countFileLine = countFileLines[countFileLineIndex]

    splitLine = countFileLine.strip().split("\t")

    if pairedEndMode:
        currGene = splitLine[4]  #Get Current Gene
        while (countFileLineIndex < countFileLineEnd
               and splitLine[4] == currGene):
            segmentID1 = splitLine[0]
            segmentID2 = splitLine[1]
            if segmentID1 == segmentID2:
                segmentIDs.append(segmentID1)
                segmentLengths.append(int(splitLine[5]))
                segmentIsoforms[segmentID1] = splitLine[9].split(",")
                for isoform in segmentIsoforms[segmentID1]:
                    if isoform not in geneIsoforms:
                        geneIsoforms.append(isoform)
            if segmentID1 not in segmentCountsDict:
                segmentCountsDict[segmentID1] = int(splitLine[2])
            else:
                segmentCountsDict[segmentID1] += int(splitLine[2])

            if segmentID2 not in segmentCountsDict:
                segmentCountsDict[segmentID2] = int(splitLine[2])
            else:
                segmentCountsDict[segmentID2] += int(splitLine[2])

            countFileLineIndex += 1
            if countFileLineIndex < countFileLineEnd:
                countFileLine = countFileLines[countFileLineIndex]
                splitLine = countFileLine.strip().split("\t")
    else:
        currGene = splitLine[3]  #Get Current Gene
        while (countFileLineIndex < countFileLineEnd
               and splitLine[3] == currGene):
            segmentID = splitLine[0]
            segmentIDs.append(segmentID)
            segmentCountsDict[segmentID] = int(splitLine[1])
            segmentLengths.append(int(splitLine[4]))

            segmentIsoforms[segmentID] = splitLine[6].split(",")
            for isoform in segmentIsoforms[segmentID]:
                if isoform not in geneIsoforms:
                    geneIsoforms.append(isoform)

            countFileLineIndex += 1
            if countFileLineIndex < countFileLineEnd:
                countFileLine = countFileLines[countFileLineIndex]
                splitLine = countFileLine.strip().split("\t")

    for segmentID in segmentIDs:  #Only get segment counts of certain segments
        segmentCounts.append(segmentCountsDict[segmentID])

    segmentGC = [0.0 for x in range(len(segmentIDs))]
    segmentCount = [0.0 for x in range(len(segmentIDs))]

    for x in range(len(segmentIDs)):
        segmentID = segmentIDs[x]
        segmentGC[x] = segmentGCDict[segmentID]
        segmentCount[x] = math.log(segmentCountsDict[segmentID])

    plt.scatter(segmentGC, segmentCount)
    plt.show()
Example #9
 def test_GC(self):
     seq = "ACGGGCTACCGTATAGGCAAGAGATGATGCCC"
     self.assertEqual(GC(seq), 56.25)
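
Bio.SeqUtils.GC is case-insensitive and also counts the ambiguous nucleotide S (G or C); a minimal check:

from Bio.SeqUtils import GC

assert GC("acgg") == 75.0   # mixed case is fine
assert GC("GCSS") == 100.0  # S counts toward GC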
Example #10
# 46.875

# The example above computes the GC content of a sequence.

# The traditional way is to count the Gs and Cs separately and take their
# proportion of the whole sequence.
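
# A minimal sketch of that manual count (plain Python; the helper name is ours):
def manual_gc(seq):
    seq = str(seq).upper()
    return 100 * (seq.count("G") + seq.count("C")) / len(seq)

manual_gc("GATCGATGGGCCTATATAGGATCGAAAATCGC")  # 46.875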

# ================================================================================
# Doing that by hand means assembling the counting and arithmetic yourself; with
# from Bio.SeqUtils import GC, as below, you get the value without any of the
# formula work.

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC

my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPAC.unambiguous_dna)

GC(my_seq)
# 46.875

# ================================================================================
# @@ The example below shows how to cut out or extract subsequences with
# string-style slicing.

# @@ A sequence can be sliced like seq[x:y:z].

# @@ x is the start position, y the end position, and z the step.

# The example below extracts the 8 bp subsequence from position 4 through 12.

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
Example #11
from Bio.SeqUtils import GC
from Bio.SeqUtils import GC123
seq = input("DNA sequence: ")
print("Total GC content:")
print(GC(seq))
print("GC by parts:")
print(GC123(seq))
input("enter")
Example #12
# Assumed preamble (the snippet starts mid-script): 'seq' is the target
# sequence string and 'seq_len' its length, both defined upstream.
import re

from Bio.SeqUtils import GC

index = [m.start() for m in re.finditer(r'AA', seq)]
index_list = []
for i in index:
    if i + 23 > seq_len:  # need room for a full 23-mer (the source checked i + 20)
        continue
    index_list.append(i)

pre_sirna = []
for index in index_list:
    pre_sirna.append(seq[index:index + 23])

pre_gc_sirna = []
index_gc_sirna = []
for i, seq1 in enumerate(pre_sirna):
    if (30 < round(GC(seq1), 2) <
            58) and ('AAAA' not in seq1[2:21]) and ('TTTT' not in seq1[2:21]):
        pre_gc_sirna.append(seq1)
        index_gc_sirna.append(index_list[i])

num = len(pre_gc_sirna)
num_mat = [100] * num

SS = []
for seq2 in pre_gc_sirna:
    SS.append(seq2[2:])

for i, seq2 in enumerate(SS):
    if seq2.endswith('TT'):
        num_mat[i] += 40
Example #13
# Assumed imports for this Django view:
from django.shortcuts import render

from Bio.Seq import Seq
from Bio.SeqUtils import GC


def result_tools3(request):
    input_seq = request.POST.get('tool1', 'default')
    rec1 = Seq(input_seq)
    ans = GC(rec1)
    params = {'res': ans}
    return render(request, 'mysite/result_tools.html', params)
Example #14
from Bio.Seq import Seq
from Bio.SeqUtils import GC
exon_seq = Seq("ATGCAGTAG")
gc_contents = GC(exon_seq)
print(gc_contents)
Example #15
def return_genbank_dict(gb_file, key='annotation', seq_type='amino_acid'):
    """Overview: Return a dictionary generated from a genbank file, keyed by the value the caller supplies.
       Returns: A dictionary created from the supplied genbank file (gb_file), indexed by the chosen key.
       Default: The default key is 'annotation'. The 'locus' key is generally the most useful, since it is
       guaranteed to be unique within the genbank file; that is not necessarily true of any other attribute.
   """
    result = {}
    seq_record = next(SeqIO.parse(open(gb_file), "genbank"))
    accession = seq_record.annotations['accessions'][0].split('.')[0]
    common_name = seq_record.annotations['organism'].replace(' ', '_')
    result.update({'accession': accession})
    result.update({'common_name': common_name})
    cnt = 0
    # loop over the genbank file
    unk_cnt = 1
    for fnum, feature in enumerate(seq_record.features):
        # here i simply check the gene coding type, and identify them in a way that can be used later.
        if feature.type == 'CDS' or feature.type == 'ncRNA' or feature.type == 'tRNA' or feature.type == 'mRNA' or feature.type == 'rRNA':
            start = feature.location.start

            stop = feature.location.end
            #print start, stop
            strand = feature.strand
            synonyms = 'NONE'
            '''
            try: 
                gene = feature.qualifiers['gene'][0]
            except:
                gene = 'unknown'
            '''

            # Check the qualifier directly, avoiding the unnecessary try/except above.
            if 'gene' in feature.qualifiers:
                gene = feature.qualifiers['gene'][0]
            else:
                gene = 'unknown'

            if 'gene_synonym' in feature.qualifiers:
                synonym_list = feature.qualifiers['gene_synonym'][0].replace(
                    ' ', '').split(';')
                synonyms = ':'.join(synonym_list)
            try:
                locus = feature.qualifiers['locus_tag'][0]
            except KeyError:
                try:
                    locus = feature.qualifiers['gene'][0]
                except KeyError:
                    locus = ''
                    print('No locus associated. This should never happen '
                          '(the gbk file has an error).')
            try:
                seq = feature.qualifiers['translation']
                seq_type = 'Protein'
            except KeyError:
                cnt = cnt + 1
                seq = seq_record.seq[start:stop]
                seq_type = feature.type
                if feature.type == 'CDS':
                    seq_type = 'Pseudo_Gene'
                    # attempt to fix an error
                    if strand == 1:
                        seq = seq.translate()
                    else:
                        seq = seq.reverse_complement().translate()
            gc = "%2.1f" % GC(seq_record.seq[start:stop])
            # Debugging something odd

            #print feature.qualifiers['gene_synonym']
            #method = "exact"
            if key == 'locus':
                result.update({locus: (locus, gene, seq, seq_type, synonyms)})
            elif key == 'annotation':
                if gene == 'unknown':
                    new_gene = 'unknown_' + str(unk_cnt)
                    header = '|'.join([
                        accession, common_name, locus, gene,
                        str(start),
                        str(stop),
                        str(strand), seq_type, synonyms, gc
                    ])
                    result.update({new_gene: [header, ''.join(seq)]})
                    unk_cnt += 1
                else:
                    header = '|'.join([
                        accession, common_name, locus, gene,
                        str(start),
                        str(stop),
                        str(strand), seq_type, synonyms, gc
                    ])
                    result.update({gene: [header, ''.join(seq)]})
                    try:
                        for syn in synonym_list:
                            result.update({syn: [header, ''.join(seq)]})
                    except NameError:  # no gene_synonym qualifier was seen
                        pass

    #print('The number of non-protein regions in %s is: %i.' % (common_name, cnt))
    return result
Example #16
# Assumed imports for this snippet:
import sys

from Bio import SeqIO
from Bio.SeqUtils import GC


def main():
    for seq_record in SeqIO.parse(sys.argv[1], "fasta"):
        print(seq_record.id)
        print(len(seq_record.seq))
        print(GC(seq_record.seq))
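
# Assumed entry point (not shown in the source); run as: python script.py input.fasta
if __name__ == "__main__":
    main()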
Example #17
import Bio
from Bio import SeqIO
from Bio.SeqUtils import GC

records = list(SeqIO.parse("sample_data/ls_orchid.fasta", "fasta"))

for dna_rec in records[:10]:
    print(dna_rec.description)
    dna = dna_rec.seq[:100]
    print("GC Content: " + str(GC(dna)) + "%")
    print("")
Example #18
def circos_gc_var(record, windows=1000, shift=0):
    '''
    :param record: SeqRecord to scan
    :return: circos string with the difference from the genome-average GC,
        e.g. average = 32, GC(seq[3000:4000]) = 44, diff = 44 - 32 = 12%
    '''
    circos_string = ''
    from Bio.SeqFeature import FeatureLocation
    average_gc = GC(record.seq)
    gap_locations = []
    for feature in record.features:
        if feature.type == "assembly_gap":
            gap_locations.append(feature.location)
    if len(gap_locations) == 0:
        gap_locations.append(FeatureLocation(0, len(record.seq)))
    else:
        gap_locations.append(
            FeatureLocation(gap_locations[-1].end + 1, len(record.seq)))
    if len(gap_locations) > 1:
        #gap_locations.append(FeatureLocation(gap_locations[-1].end + 1, len(record.seq)))

        for i in range(0, len(gap_locations)):
            if i == 0:
                seq = record.seq[0:gap_locations[i].start]
                chr_start = 0
            else:
                seq = record.seq[
                    gap_locations[i - 1].end:gap_locations[i].start]
                chr_start = gap_locations[i - 1].end
            contig_name = record.name + "_%s" % (i + 1)

            if len(seq) < windows:
                windows_range = len(seq)
                #print seq
            else:
                windows_range = windows
            if len(seq) == 0:
                continue
            #print record
            # use a distinct loop variable so the outer gap index is not shadowed
            for w in range(0, len(seq), windows_range):
                start = w
                stop = w + windows
                #gc = ((GC(record.seq[start:stop])/average_gc) - 1)*100
                if 'n' in record.seq[start:stop]:
                    continue
                if 'N' in record.seq[start:stop]:
                    continue
                gc = GC(record.seq[start:stop]) - average_gc
                if stop > len(seq):
                    stop = len(seq)
                    #if stop - start < 200:
                    #    break
                section_start = chr_start + start
                section_end = chr_start + stop
                circos_string += "%s %s %s %s\n" % (contig_name, section_start,
                                                    section_end, gc)
    else:
        seq = record.seq
        contig_name = record.id.split('.')[0]
        for i in range(0, len(seq), windows):
            start = i
            stop = i + windows
            #gc = ((GC(record.seq[start:stop])/average_gc) - 1)*100
            gc = GC(record.seq[start:stop]) - average_gc
            if stop > len(seq):
                stop = len(seq)
                if stop - start < 500:
                    break
            circos_string += "%s %s %s %s\n" % (contig_name, start + shift,
                                                stop + shift, gc)
    return circos_string
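
A hedged usage sketch (hypothetical GenBank file name); each emitted line is one circos data row, "<contig> <start> <end> <gc_deviation>":

from Bio import SeqIO

record = next(SeqIO.parse("genome.gbk", "genbank"))
print(circos_gc_var(record, windows=5000))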
Example #19
              ' or it is unreadable')
        quit()

#input_fasta_path = sys.argv[1]
#output_table_path = sys.argv[2]

output = open(output_table_path, 'w')

if no_coverage_mode:
    output.write('contig\tlength\tgc\n')
else:
    output.write('contig\tlength\tgc\tcov\n')

for seq_record in SeqIO.parse(fasta_assembly_path, 'fasta'):
    length = str(len(seq_record))
    gc = str(GC(str(seq_record.seq)))
    contig = str(seq_record.id)
    if use_coverage_table:
        if contig in coverages:
            cov = coverages[contig]
        else:
            print('Error, ' + contig + ' not found in the coverage table')
            quit()
        output.write(contig + '\t' + length + '\t' + gc + '\t' + str(cov) +
                     '\n')
    elif no_coverage_mode:
        output.write(contig + '\t' + length + '\t' + gc + '\n')
    else:
        # Do a format check to make sure the contig name is right
        contigList = contig.split('_')
        if contigList[0] == 'NODE' and contigList[2] == 'length':
            # Assumed completion (the source is truncated here): parse coverage
            # from a SPAdes-style header, NODE_<n>_length_<len>_cov_<cov>.
            cov = contigList[5]
            output.write(contig + '\t' + length + '\t' + gc + '\t' + cov + '\n')
        else:
            print('Error, ' + contig + ' is not a SPAdes-style contig name')
            quit()
Example #20
def gc_values(handle):
    parsed_handle = [record for record in SeqIO.parse(handle, "fasta")]
    gc_values = [GC(rec.seq) for rec in parsed_handle]
    seq_ids = [rec.id for rec in parsed_handle]
    for i in range(0, len(seq_ids)):
        print(seq_ids[i] + "\t" + str(gc_values[i]))
Example #21
def dna_features(dna_sequences):
    """
    This function calculates a variety of properties from DNA sequences.

    Input: a list of DNA sequences (may contain just one)
    Output: a dataframe of features
    """

    import numpy as np
    import pandas as pd
    from Bio.SeqUtils import GC, CodonUsage

    A_freq = []
    T_freq = []
    C_freq = []
    G_freq = []
    GC_content = []
    codontable = {
        'ATA': [],
        'ATC': [],
        'ATT': [],
        'ATG': [],
        'ACA': [],
        'ACC': [],
        'ACG': [],
        'ACT': [],
        'AAC': [],
        'AAT': [],
        'AAA': [],
        'AAG': [],
        'AGC': [],
        'AGT': [],
        'AGA': [],
        'AGG': [],
        'CTA': [],
        'CTC': [],
        'CTG': [],
        'CTT': [],
        'CCA': [],
        'CCC': [],
        'CCG': [],
        'CCT': [],
        'CAC': [],
        'CAT': [],
        'CAA': [],
        'CAG': [],
        'CGA': [],
        'CGC': [],
        'CGG': [],
        'CGT': [],
        'GTA': [],
        'GTC': [],
        'GTG': [],
        'GTT': [],
        'GCA': [],
        'GCC': [],
        'GCG': [],
        'GCT': [],
        'GAC': [],
        'GAT': [],
        'GAA': [],
        'GAG': [],
        'GGA': [],
        'GGC': [],
        'GGG': [],
        'GGT': [],
        'TCA': [],
        'TCC': [],
        'TCG': [],
        'TCT': [],
        'TTC': [],
        'TTT': [],
        'TTA': [],
        'TTG': [],
        'TAC': [],
        'TAT': [],
        'TAA': [],
        'TAG': [],
        'TGC': [],
        'TGT': [],
        'TGA': [],
        'TGG': []
    }

    for item in dna_sequences:
        # nucleotide frequencies
        A_freq.append(item.count('A') / len(item))
        T_freq.append(item.count('T') / len(item))
        C_freq.append(item.count('C') / len(item))
        G_freq.append(item.count('G') / len(item))

        # GC content
        GC_content.append(GC(item))

        # codon frequency: count codons, normalize counts, add to dict
        codons = [item[i:i + 3] for i in range(0, len(item), 3)]
        counts = [codons.count(key) for key in codontable]
        l_norm = [float(c) / sum(counts) for c in counts]

        for j, key in enumerate(codontable.keys()):
            codontable[key].append(l_norm[j])

    # codon usage bias (_b)
    synonym_codons = CodonUsage.SynonymousCodons
    codontable2 = {
        'ATA_b': [],
        'ATC_b': [],
        'ATT_b': [],
        'ATG_b': [],
        'ACA_b': [],
        'ACC_b': [],
        'ACG_b': [],
        'ACT_b': [],
        'AAC_b': [],
        'AAT_b': [],
        'AAA_b': [],
        'AAG_b': [],
        'AGC_b': [],
        'AGT_b': [],
        'AGA_b': [],
        'AGG_b': [],
        'CTA_b': [],
        'CTC_b': [],
        'CTG_b': [],
        'CTT_b': [],
        'CCA_b': [],
        'CCC_b': [],
        'CCG_b': [],
        'CCT_b': [],
        'CAC_b': [],
        'CAT_b': [],
        'CAA_b': [],
        'CAG_b': [],
        'CGA_b': [],
        'CGC_b': [],
        'CGG_b': [],
        'CGT_b': [],
        'GTA_b': [],
        'GTC_b': [],
        'GTG_b': [],
        'GTT_b': [],
        'GCA_b': [],
        'GCC_b': [],
        'GCG_b': [],
        'GCT_b': [],
        'GAC_b': [],
        'GAT_b': [],
        'GAA_b': [],
        'GAG_b': [],
        'GGA_b': [],
        'GGC_b': [],
        'GGG_b': [],
        'GGT_b': [],
        'TCA_b': [],
        'TCC_b': [],
        'TCG_b': [],
        'TCT_b': [],
        'TTC_b': [],
        'TTT_b': [],
        'TTA_b': [],
        'TTG_b': [],
        'TAC_b': [],
        'TAT_b': [],
        'TAA_b': [],
        'TAG_b': [],
        'TGC_b': [],
        'TGT_b': [],
        'TGA_b': [],
        'TGG_b': []
    }

    for item1 in dna_sequences:
        codons = [item1[l:l + 3] for l in range(0, len(item1), 3)]
        codon_counts = []

        # count codons corresponding to codontable (not codontable2 because keynames changed!)
        for key in codontable.keys():
            codon_counts.append(codons.count(key))

        # count total for synonymous codons, divide each synonym codon count by total
        for key_syn in synonym_codons.keys():
            total = 0
            for item2 in synonym_codons[key_syn]:
                total += codons.count(item2)
            for j, key_table in enumerate(codontable.keys()):
                if (key_table in synonym_codons[key_syn]) & (total != 0):
                    codon_counts[j] /= total

        # add corrected counts to codontable2 (also corresponds to codontable which was used to count codons)
        for k, key_table in enumerate(codontable2.keys()):
            codontable2[key_table].append(codon_counts[k])

    # make new dataframes & standardize
    features_codonbias = pd.DataFrame.from_dict(codontable2)
    features_dna = pd.DataFrame.from_dict(codontable)
    features_dna['A_freq'] = np.asarray(A_freq)
    features_dna['T_freq'] = np.asarray(T_freq)
    features_dna['C_freq'] = np.asarray(C_freq)
    features_dna['G_freq'] = np.asarray(G_freq)
    features_dna['GC'] = np.asarray(GC_content)

    # concatenate dataframes & return
    features = pd.concat([features_dna, features_codonbias], axis=1)
    return features
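
A hedged usage sketch (toy in-frame sequences; assumes dna_features above is in scope):

seqs = ["ATGGCGTGGTAA", "ATGAAACCCGGGTAA"]
features = dna_features(seqs)
print(features[["A_freq", "GC", "ATG"]])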
Example #22
def whole_gc(records):
    seq = ""
    for record in records:
        seq += record.seq
    return GC(seq)
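
A hedged usage sketch (hypothetical file name):

from Bio import SeqIO

records = SeqIO.parse("contigs.fasta", "fasta")
print(whole_gc(records))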
Example #23
#Biopython

from Bio import SeqIO
from Bio.SeqUtils import GC

GCcont = 0
ID = ""

file = open("rosalind_GC.txt", "r")
for record in SeqIO.parse(file, "fasta"):
    if GCcont < GC(record.seq):
        GCcont = GC(record.seq)
        ID = record.id

print(ID)
print(str(round(GCcont, 2)) + "%")



#dictionary

raw = open('rosalind_gc.txt', 'r').read()

d = {}

for seqblock in raw.split(">")[1:]:
    parts = seqblock.split("\n")
    id = parts[0]
    seq = ''.join(parts[1:])
    gc = 100 * (seq.count("G") + seq.count("C")) / float(len(seq))
    d[gc] = id

# Report the best hit (assumed final step, missing from the source):
best = max(d)
print(d[best])
print(round(best, 6))
Example #24
def circos_gc_content(record, windows=1000, shift=0):
    '''
    :param record: SeqRecord to scan
    :return: circos string with the GC content of each window (unlike
        circos_gc_var, which reports the difference from the average GC,
        e.g. average = 32, GC(seq[3000:4000]) = 44, diff = 44 - 32 = 12%)

    UPDATE 12.06.2017: calculation based on the complete (concatenated)
    sequence, then converted to draft contig coordinates
    '''
    circos_string = ''
    from Bio.SeqFeature import FeatureLocation
    average_gc = GC(record.seq)
    gap_locations = []
    for feature in record.features:
        if feature.type == "assembly_gap":
            gap_locations.append(feature.location)
    if len(gap_locations) == 0:
        gap_locations.append(FeatureLocation(0, len(record.seq)))
    else:
        gap_locations.append(
            FeatureLocation(gap_locations[-1].end + 1, len(record.seq)))
    if len(gap_locations) > 1:
        #gap_locations.append(FeatureLocation(gap_locations[-1].end + 1, len(record.seq)))

        for i in range(0, len(gap_locations)):
            if i == 0:
                seq = record.seq[0:gap_locations[i].start]
                chr_start = 0
            else:
                seq = record.seq[
                    gap_locations[i - 1].end:gap_locations[i].start]
                chr_start = gap_locations[i - 1].end
            contig_name = record.name + "_%s" % (i + 1)
            if len(seq) <= windows:
                window_range = len(seq)
                #print 'small contig!!!!!!!!!!', len(seq)
            else:
                #print 'len contig', len(seq)
                window_range = windows
            # use a distinct loop variable so the outer gap index is not shadowed
            for w in range(0, len(seq), window_range):
                start = w
                stop = w + windows
                #gc = ((GC(record.seq[start:stop])/average_gc) - 1)*100
                gc = GC(record.seq[start:stop])
                if stop > len(seq):
                    stop = len(seq)
                    #print "small contig!!", start, stop, gc
                    #if stop - start < 200:
                    #    break
                section_start = chr_start + start
                section_end = chr_start + stop
                circos_string += "%s %s %s %s\n" % (contig_name, section_start,
                                                    section_end, gc)

    else:
        seq = record.seq
        contig_name = record.id  #.split('.')[0]
        for i in range(0, len(seq), windows):
            start = i
            stop = i + windows
            #gc = ((GC(record.seq[start:stop])/average_gc) - 1)*100
            gc = GC(record.seq[start:stop])
            if stop > len(seq):
                stop = len(seq)
                if stop - start < 500:
                    break
            circos_string += "%s %s %s %s\n" % (contig_name, start + shift,
                                                stop + shift, gc)
    return circos_string
Example #25
from Bio import SeqIO
from Bio.SeqUtils import GC
import pylab

for i in SeqIO.parse("NC_017108.gbk", "genbank"):
    gc = GC(i.seq)

at = 100 - gc

pylab.pie([gc, at])
pylab.title("GC content:")
pylab.xlabel("GC: %0.1f percent\nAT: %0.1f percent" % (gc, at))
pylab.show()
Example #26
def adapter_find(reference_database, reads, threads, max_intron_length,
                 working_dir, verbose):
    subset_fasta = reads + "subset.10000.fasta"
    count_reads = 0
    with open(subset_fasta, "w") as fh:
        for rec in SeqIO.parse(reads, "fasta"):
            if int(rec.id) < 10000:
                SeqIO.write(rec, fh, "fasta")

    bam = mapping.minimap(reference_database, subset_fasta, threads,
                          max_intron_length, working_dir, verbose)
    #soft_clip_regions = soft_clip(bam)
    fasta_gz = bam + ".fasta.gz"
    cmd = "extractSoftclipped %s > %s" % (bam, fasta_gz)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    extract_clip = subprocess.Popen(cmd, cwd=working_dir, shell=True)
    extract_clip.communicate()

    list_short = []
    list_long = []
    dict_uniq = {}
    with gzip.open(fasta_gz, "rt") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            name_seq = str(rec.id)
            name = name_seq.split("_")[0]
            if name in dict_uniq:
                if len(dict_uniq[name].seq) > len(rec.seq):
                    list_long.append(dict_uniq[name])
                    list_short.append(rec)
                else:
                    list_short.append(dict_uniq[name])
                    list_long.append(rec)
            else:
                dict_uniq[name] = rec
    long_file = fasta_gz + ".long.fasta"
    with open(long_file, "w") as fh:
        SeqIO.write(list_long, fh, "fasta")
    short_file = fasta_gz + ".short.fasta"
    with open(short_file, "w") as fh:
        SeqIO.write(list_short, fh, "fasta")
    list_file_clip = [(long_file, "long"), (short_file, "short")]

    list_file_adapter = []
    for clip_file in list_file_clip:
        kmer_start = 21
        list_kmer = []
        while kmer_start < 120:
            cmd = "jellyfish count -s 10000000 -m %s -o %s.%s.kmer %s" % (
                kmer_start, kmer_start, clip_file[1], clip_file[0])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_count = subprocess.Popen(cmd, cwd=working_dir, shell=True)
            jelly_count.communicate()
            cmd = "jellyfish dump -L 2 -ct %s.%s.kmer | sort -k2n | tail -n 1" % (
                kmer_start, clip_file[1])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_dump = subprocess.Popen(cmd,
                                          cwd=working_dir,
                                          stdout=subprocess.PIPE,
                                          shell=True)
            out_dump = jelly_dump.communicate()[0].decode('utf-8')
            mer = out_dump.split("\t")[0]
            a_count = mer.count("A")
            t_count = mer.count("T")
            if a_count > t_count:
                bias_count = a_count
            else:
                bias_count = t_count
            data_kmer = (kmer_start, mer, GC(mer),
                         (bias_count / kmer_start) * 100,
                         (bias_count / kmer_start) * 100 - GC(mer))
            list_kmer.append(data_kmer)
            kmer_start += 5

        value_adapter = 0
        for i in list_kmer:
            if i[4] > value_adapter:
                value_adapter = i[4]
                kmer_done = i[1]
    adapter_file = os.path.join(working_dir, "adapter.fasta")
    with open(adapter_file, "w") as fh:
        record = SeqRecord(Seq(str(kmer_done)), id="adapter")
        SeqIO.write(record, fh, "fasta")
    return adapter_file
Example #27
def fix_gc_content(refactor_context,
                   gc_content_constraint_obj,
                   start_bound=None,
                   end_bound=None,
                   debug=False,
                   report_file=None):
    """Fixes the GC content according to desired constraints.

    Strategy:
        Slide a window across the genome and bump any regions that fall
        outside of the constraint. Now, there is some subtlety here in that
        we have a good idea of how to fix coding regions (i.e. synonymous
        codon swaps), but want to avoid messing with stuff outside of
        coding regions. And so when we identify a bad window, for now,
        we limit fixes to any coding portions of that window only.

    TODOs:
        * We are only dealing with local window for now. Figure out how we want
        to deal with global window.

    Args:
        refactor_context: The RefactorContext.
        gc_content_constraint_obj: A GCContentConstraints object that
            allows the client to configure the fixes.
        start_bound: Optionally bound fixes to start at this position.
        end_bound: Optionally bound fixes to end at this position.
        debug: Debug flag. Prints helpful output. For now, runs analysis only,
            and doesn't actually make changes.

    Returns:
        A copy of the genome_record contained within refactor_context
        with the GC content made to satisfy constraints.
    """
    print('Fixing GC content...')
    updated_genome_record = copy.deepcopy(refactor_context.get_genome_record())

    # Figure out effective bounds.
    effective_start_bound = start_bound if start_bound else 0
    effective_end_bound = (end_bound if end_bound
                           else len(updated_genome_record))

    # Features that we can do synonymous swaps in
    swappable_features = [
        feature for feature in updated_genome_record.features
        if feature.type == 'CDS'
    ]

    # Slide the window looking for violations of GC content restrictions.
    window_center_range = range(
        effective_start_bound +
        gc_content_constraint_obj.local_window_size // 2,
        effective_end_bound - gc_content_constraint_obj.local_window_size // 2)
    report_intervals = []
    if debug:
        running_interval = None
        running_gc_total = 0
    for window_center_pos in window_center_range:
        window_start_pos = (window_center_pos -
                            gc_content_constraint_obj.local_window_size // 2)
        window_end_pos = (window_start_pos +
                          gc_content_constraint_obj.local_window_size)
        window_seq = updated_genome_record.seq[window_start_pos:window_end_pos]
        gc_content = GC(window_seq) / 100
        if (gc_content_constraint_obj.local_window_lower_bound <= gc_content <=
                gc_content_constraint_obj.local_window_upper_bound):
            # GC is all good.
            if debug:
                # Close the running interval and print it out.
                if running_interval:
                    interval_size = (running_interval[1] -
                                     running_interval[0] + 1)
                    avg_gc = running_gc_total / interval_size
                    report_intervals.append({
                        'interval': str(running_interval),
                        'interval_size': interval_size,
                        'avg_gc': avg_gc,
                    })
                    print('%s, size: %d, average_gc: %f' %
                          (str(running_interval), interval_size, avg_gc))
                    running_interval = None
                    running_gc_total = 0
            continue

        if debug:
            if not running_interval:
                running_interval = (window_center_pos, window_center_pos)
            else:
                running_interval = (running_interval[0], window_center_pos)
            running_gc_total += gc_content
            continue

        # As a first stab, only attempt fixes in the simplest of cases.
        # That is, only do synonymous codon swaps within parts of features
        # that are not overlapping.

        # First identify all features overlaped by the interval.
        interval = (window_start_pos, window_end_pos)
        overlapped_features = calc_interval_list_to_features_overlapped(
            [interval], swappable_features)[0]
        if len(overlapped_features) != 1:
            # TODO: Eventually handle more complex cases.
            continue

        # Otherwise attempt to fix.
        feature = overlapped_features[0]
        feature_seq = str(feature.extract(updated_genome_record.seq))

        # Figure out the specific codons that need to be changed.
        affected_codon_indeces = get_region_codon_indeces_in_feature(
            feature, interval)
        avoid_codons_in_positions = {}
        for codon_index in affected_codon_indeces:
            codon = feature_seq[codon_index * 3:codon_index * 3 + 3]
            # GC() returns a percentage; mark any codon that is not already
            # all-GC (the source compared against 1.0, a fraction).
            if GC(codon) / 100 < 1.0:
                avoid_codons_in_positions[codon_index] = codon

        # Perform replace.
        first_codon_to_modify = affected_codon_indeces[0]
        last_codon_to_modify = affected_codon_indeces[-1]
        assert first_codon_to_modify <= last_codon_to_modify
        result = replace_codons_in_single_feature(
            refactor_context,
            feature.id,
            explicit_genome_record=updated_genome_record,
            start_codon_index=first_codon_to_modify,
            last_codon_index=last_codon_to_modify,
            avoid_codons_in_positions=avoid_codons_in_positions)
        if not result['is_success']:
            # TODO: Do something better for debugging here, although
            # we don't necessarily need each replace to succeed.
            continue

        update_seq_record_feature(updated_genome_record, feature.id, result)

    print('...Done.')

    if report_file:
        print('Writing report.')
        REPORT_FIELDNAMES = [
            'interval',
            'interval_size',
            'avg_gc',
        ]
        with open(report_file, 'w') as report_fh:
            writer = csv.DictWriter(report_fh, REPORT_FIELDNAMES)
            writer.writeheader()
            for interval in report_intervals:
                writer.writerow(interval)

    return updated_genome_record
Example #28
else:
    printlog("\t\tUnable to determine a likely cluster.")

printlog("\tBase One Guess")
if base_ones:
    for base_one in base_ones:
        out = "\t\tIn the blast hit to %s, query position %s matches subject position %s." % (base_one[0], str(base_one[1]), str(base_one[2]))
        out2 = "\t\tLikely Base 1 position: %s in %s" % (base_one[1], base_one[4]) 
        if base_one[3]:
            out += "  (After contig was reverse-complemented.)"
        printlog(out)
        printlog(out2)
else:
    printlog("\t\tUnable to find Base 1.")

printlog("\tGC Info")
i = 0
all_contig_objects = SeqIO.parse(
    open('%s/454AllContigs.fna' % project_dir, 'r'), 'fasta')
for contig in all_contig_objects:
    if i == 10:  # report only the first 10 contigs
        break
    printlog("\t\t%s\t%s %%" % (contig.id, round(GC(contig.seq),1)))
    i += 1

printlog("\tCoverage Info")
for contig in contig_list:
    printlog("\t\t%s\t%s (assembled)\t%s (estimated for entire fastq)" % (contig[0],contig[4],contig[5])) 

log_file.close()

Example #29
def find_gc_content_extremes(genome_record,
                             gc_content_constraint_obj=GCContentConstraints(),
                             start_bound=None,
                             end_bound=None,
                             debug=False):
    """Finds runs of extreme GC content.

    Args:
        genome_record: The SeqRecord object with the sequence.
        gc_content_constraint_obj: A GCContentConstraints object that
            allows the client to configure the fixes.
        start_bound: Optionally bound fixes to start at this position.
        end_bound: Optionally bound fixes to end at this position.

    Returns:
        List of objects with keys:
            * interval: Pythonic (start, end) of the interval.
            * avg_gc: Average GC content over this interval.
    """
    extreme_gc_intervals = []

    effective_start_bound = start_bound if start_bound else 0
    effective_end_bound = end_bound if end_bound else len(genome_record.seq)

    running_interval = None
    running_gc_total = 0

    # Slide the window looking for violations of GC content restrictions.
    window_center_range = xrange(
        effective_start_bound +
        gc_content_constraint_obj.local_window_size / 2,
        effective_end_bound - gc_content_constraint_obj.local_window_size / 2)
    # Necessary initialization for our get_GC_optimized() method.
    gc_content = None
    for window_center_pos in window_center_range:
        window_start_pos = (window_center_pos -
                            gc_content_constraint_obj.local_window_size / 2)
        window_end_pos = (window_start_pos +
                          gc_content_constraint_obj.local_window_size)
        gc_content = GC(genome_record.seq, window_start_pos, window_end_pos,
                        gc_content)
        if (gc_content_constraint_obj.local_window_lower_bound <= gc_content <=
                gc_content_constraint_obj.local_window_upper_bound):
            # End of extreme interval. Record the current interval and reset.
            if running_interval:
                interval_size = running_interval[1] - running_interval[0] + 1
                avg_gc = running_gc_total / interval_size
                extreme_gc_intervals.append({
                    'interval': running_interval,
                    'avg_gc': avg_gc
                })

                # Reset.
                running_interval = None
                running_gc_total = 0
        else:
            # Create or update the running interval.
            if not running_interval:
                running_interval = (window_center_pos, window_center_pos)
            else:
                running_interval = (running_interval[0], window_center_pos)
            running_gc_total += gc_content

    return extreme_gc_intervals
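
The GC(seq, start, end, prev) call above assumes an incremental windowed-GC helper (the get_GC_optimized() pattern the comments refer to): rather than rescanning the whole window at each position, it updates the previous value using the one base that left the window and the one that entered it. A rough sketch of such a helper, assuming a plain string sequence, consecutive one-base shifts, and a percentage return value like Biopython's GC():

def windowed_gc(seq, start, end, prev_gc=None):
    """Sliding-window GC%; O(1) per one-base shift given the previous value."""
    size = end - start
    if prev_gc is None:
        # First window: count G/C bases directly.
        gc_count = sum(1 for base in seq[start:end] if base in 'GCgc')
        return 100.0 * gc_count / size
    # Recover the previous count, then adjust for the shifted window.
    gc_count = prev_gc * size / 100.0
    if seq[start - 1] in 'GCgc':  # base that just left the window
        gc_count -= 1
    if seq[end - 1] in 'GCgc':    # base that just entered the window
        gc_count += 1
    return 100.0 * gc_count / size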
Example #30
0
def main(argv):
    
    #default parameters
    mg_lst = []
    ref_lst = []
    e_val = 1e-5
    alen = 50.0
    alen_percent = True
    alen_bp = False
    iden = 95.0
    name= "output"
    fmt_lst = ["fasta"]
    supported_formats =["fasta", "csv"]
    iterations = 1
    alen_increment = 5.0
    iden_increment = 0.0
    blast_db_Dir = ""
    results_Dir = ""
    input_files_Dir = ""
    ref_out_0 = ""
    blasted_lst = []
    continue_from_previous = False #poorly supported, just keeping the directories
    skip_blasting = False
    debugging = False
    sheared = False
    shear_val = None
    logfile = ""
    
    
             
    try:                                
        opts, args = getopt.getopt(argv, "r:m:n:e:a:i:s:f:h", ["reference=", "metagenome=", "name=", "e_value=", "alignment_length=", "identity=","shear=","format=", "iterations=", "alen_increment=", "iden_increment=","continue_from_previous","skip_blasting","debugging", "help"])
    except getopt.GetoptError:          
        usage()                         
        sys.exit(2)                     
    for opt, arg in opts:                
        if opt in ("-h", "--help"):      
            usage()              
            sys.exit() 
#        elif opt in ("--recover_after_failure"):
#            recover_after_failure = True
#            print "Recover after failure:", recover_after_failure  

        elif opt in ("--continue_from_previous"):
            continue_from_previous = True
            if debugging:
                print "Continue after failure:", continue_from_previous
        elif opt in ("--debugging"):
            debugging = True
            if debugging:
                print "Debugging messages:", debugging   
                     
        elif opt in ("-r", "--reference"):
            if arg:
                ref_lst=arg.split(',')
                #infiles = arg
            if debugging:
                print "Reference file(s)", ref_lst  
        elif opt in ("-m", "--metagenome"):
            if arg:
                mg_lst=arg.split(',')
                #infiles = arg
            if debugging:
                print "Metagenome file(s)", mg_lst
            
        elif opt in ("-f", "--format"):
            if arg:
                fmt_lst=arg.split(',')
                #infiles = arg
            if debugging:
                print "Output format(s)", fmt_lst
        
        elif opt in ("-n", "--name"):
            if arg.strip():              
                name = arg
            if debugging:
                print "Project name", name 
            
        elif opt in ("-e", "--e_value"):
            try:
                e_val = float(arg)
            except:
                print "\nERROR: Please enter numerical value as -e parameter (default: 1e-5)"
                usage()
                sys.exit(1)
            if debugging:
                print "E value", e_val
            
        elif opt in ("-a", "--alignment_length"):
            if arg.strip()[-1]=="%":
                alen_bp = False
                alen_percent = True
            else:
                alen_bp = True
                alen_percent = False
                
            try:
                alen = float(arg.split("%")[0])
            except:
                print "\nERROR: Please enter a numerical value as -a parameter (default: 50.0)"
                usage()
                sys.exit(1)
            if debugging:
                print "Alignment length", alen           
            
        elif opt in ("-i", "--identity"):
            try:
                iden = float(arg)
            except:
                print "\nERROR: Please enter a numerical value as -i parameter (default: 95.0)"
                usage()
                sys.exit(1)
            if debugging:
                print "Alignment length", iden    
        elif opt in ("-s", "--shear"):
            sheared = True
            try:
                shear_val = int(arg)
            except:
                print "\nERROR: Please enter an integer value as -s parameter"
                usage()
                sys.exit(1)
            if debugging:
                print "Alignment length", iden 
        elif opt in ("--iterations"):
            try:
                iterations = int(arg)
            except:
                
                print "\nWARNING: Please enter integer value as --iterations parameter (using default: 1)"
            if debugging:
                print "Iterations: ", iterations  
            
        elif opt in ("--alen_increment"):
            
            try:
                alen_increment = float(arg)
            except:
                print "\nWARNING: Please enter numerical value as --alen_increment parameter (using default: )", alen_increment
            if debugging:
                print "Alignment length increment: ", alen_increment 
 
        elif opt in ("--iden_increment"):
            
            try:
                iden_increment = float(arg)
            except:
                print "\nWARNING: Please enter numerical value as --iden_increment parameter (using default: )", iden_increment
            if debugging:
                print "Alignment length increment: ", iden_increment 
        elif opt in ("--skip_blasting"):
            skip_blasting = True
            if debugging:
                print "Blasting step omitted; Using previous blast output."
            
    for ref_file in [x for x in ref_lst if x]:
        try:
            #
            with open(ref_file, "rU") as hand_ref:
                pass
        except:
            print "\nERROR: Reference File(s) ["+ref_file+"] doesn't exist"
            usage()
            sys.exit(1)

            
    for mg_file in [x for x in mg_lst if x]:
        try:
            #
            with open(mg_file, "rU") as hand_mg:
                pass
        except:
            print "\nERROR: Metagenome File(s) ["+mg_file+"] doesn't exist"
            usage()
            sys.exit(1) 
            
    for fmt in [x for x in fmt_lst if x]:
        if fmt not in supported_formats:
            print "\nWARNING: Output format [",fmt,"] is not supported"
            print "\tUse -h(--help) option for the list of supported formats"
            fmt_lst=["fasta"]
            print "\tUsing default output format: ", fmt_lst[0]
 
    
    project_dir = name
    if not continue_from_previous:
        if os.path.exists(project_dir):
            shutil.rmtree(project_dir)
        try:
            os.mkdir(project_dir)
        except OSError:
            print "ERROR: Cannot create project directory: " + name
            raise
    
    print "\n\t Initial Parameters:"
    print "\nProject Name: ", name,'\n'
    print "Project Directory: ", os.path.abspath(name),'\n'
    print "Reference File(s): ", ref_lst,'\n'
    if sheared:
        print "Shear Reference File(s):", str(shear_val)+"bp",'\n'
    print "Metagenome File(s): ", mg_lst,'\n'
    print "E Value: ", e_val, "\n"
    if alen_percent:
        print "Alignment Length: "+str(alen)+'%\n'
    if alen_bp:
        print "Alignment Length: "+str(alen)+'bp\n'
    print "Sequence Identity: "+str(iden)+'%\n'
    print "Output Format(s):", fmt_lst,'\n'
    if iterations > 1:
        print "Iterations: ", iterations, '\n'
        print "Alignment Length Increment: ", alen_increment, '\n'
        print "Sequence identity Increment: ", iden_increment, '\n'

    #Initializing directories
    blast_db_Dir = name+"/blast_db"
    if not continue_from_previous:
        if os.path.exists(blast_db_Dir):
            shutil.rmtree(blast_db_Dir)
        try:
            os.mkdir(blast_db_Dir)
        except OSError:
            print "ERROR: Cannot create project directory: " + blast_db_Dir
            raise

    results_Dir = name+"/results"
    if not continue_from_previous:
    
        if os.path.exists(results_Dir):
            shutil.rmtree(results_Dir)
        try:
            os.mkdir(results_Dir)
        except OSError:
            print "ERROR: Cannot create project directory: " + results_Dir
            raise

    input_files_Dir = name+"/input_files"
    if not continue_from_previous:
    
        if os.path.exists(input_files_Dir):
            shutil.rmtree(input_files_Dir)
        try:
            os.mkdir(input_files_Dir)
        except OSError:
            print "ERROR: Cannot create project directory: " + input_files_Dir
            raise

# Writing the raw reference files into a single combined input file
    input_ref_records = {}
    for reference in ref_lst:
        ref_records_ind = parse_contigs_ind(reference)
        #ref_records = dict(ref_records_ind)
        input_ref_records.update(ref_records_ind)
        ref_records_ind.close()
        #input_ref_records.update(ref_records)
        
    ref_out_0 = input_files_Dir+"/reference0.fna"
    if (sheared & bool(shear_val)):
        with open(ref_out_0, "w") as handle:
            SeqIO.write(genome_shredder(input_ref_records, shear_val).values(), handle, "fasta")

            # No need to close: the with statement closes the file automatically.
    else:
        with open(ref_out_0, "w") as handle:
            SeqIO.write(input_ref_records.values(), handle, "fasta")

# Making BLAST databases
    #output fname from before used as input for blast database creation
    input_ref_0 = ref_out_0
    title_db = name+"_db"#add iteration functionality
    #diamond
    os.mkdir(blast_db_Dir+"/iteration"+str(iterations))
    outfile_db = blast_db_Dir+"/iteration"+str(iterations)+"/db" #change into for loop
    #os.system("makeblastdb -in "+input_ref_0+" -dbtype prot -title "+title_db+" -out "+outfile_db+" -parse_seqids")
    os.system("diamond makedb --in "+input_ref_0+" --db "+outfile_db)
    
# BLASTing query contigs
    if not skip_blasting:
        print "\nBLASTing query file(s):"
        for i in range(len(mg_lst)):
            
            database = outfile_db # adjust for iterations
            blasted_lst.append(results_Dir+"/recruited_mg_"+str(i)+".tab")
            start = time.time()
            #os_string = 'blastp -db '+database+' -query \"'+mg_lst[i]+'\" -out '+blasted_lst[i]+" -evalue "+str(e_val)+"  -outfmt 6 -num_threads 8"
            os_string = 'diamond blastp -p 8 -f 6 -d '+database+'.dmnd -q \"'+mg_lst[i]+'\" --out '+blasted_lst[i]+" --evalue "+str(e_val)
            #print os_string
            os.system(os_string)
            print "\t"+mg_lst[i]+"; Time elapsed: "+str(time.time()-start)+" seconds."
    else:
        for i in range(len(mg_lst)):
            blasted_lst.append(results_Dir+"/recruited_mg_"+str(i)+".tab")
        
        
# Parsing BLAST outputs
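    # The 12 default columns of BLAST/DIAMOND tabular output (outfmt 6):
    # query id, subject id, % identity, alignment length, mismatches,
    # gap opens, query start/end, subject start/end, e-value, bit score.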
    blast_cols = ['quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits']
    recruited_mg=[]
    for i in range(len(mg_lst)):
        try:
            df = pandas.read_csv(blasted_lst[i] ,sep="\t", header=None)
        except:
            df = pandas.DataFrame(columns=blast_cols)
        df.columns=blast_cols
        recruited_mg.append(df)
        
#    print len(recruited_mg[0])
#    print len(recruited_mg[1])

    #creating all_records entry
#! Remember to close index objects after they are no longer needed
#! Use helper function close_ind_lst()
    all_records = []
    all_input_recs = parse_contigs_ind(ref_out_0)
    
    ##calculating GC of the reference
#    if (len(all_input_recs)>1):
#TODO: make a better adaptation
    if False:    # I'm adapting the script for blastn
        pass
 #       ref_gc_lst = np.array([GC(x.seq) for x in all_input_recs.values()])
 #       ref_cnt = ref_gc_lst.size
 #       ref_gc_avg = np.mean(ref_gc_lst)
 #       ref_gc_avg_std = np.std(ref_gc_lst)
 #       if(len(ref_gc_lst) > 0):
 #           ref_gc_avg_sem = stats.sem(ref_gc_lst, axis=0)
 #       else:
 #           ref_gc_avg_sem=0

    else:
        if (debugging):
            print "Only one reference"
        ref_gc_lst = np.array([GC(x.seq) for x in all_input_recs.values()])
        ref_cnt = ref_gc_lst.size
        ref_gc_avg = np.mean(ref_gc_lst)
        ref_gc_avg_std=0
        ref_gc_avg_sem=0
    #ref_gc_avg_sem = stats.sem(ref_gc_lst, axis=0)
    
#    _ = 0
#    for key, value in all_input_recs.items():
#        _ +=1
#        if _ < 20:
#            print key, len(value)
    
    
    print "\nIndexing metagenome file(s):"
    for i in range(len(mg_lst)):
        start = time.time()
        all_records.append(parse_contigs_ind(mg_lst[i]))
        print "\t"+mg_lst[i]+" Indexed in : "+str(time.time()-start)+" seconds."

# Transforming data
    print "\nParsing recruited contigs:"
    for i in range(len(mg_lst)):
        start = time.time()
    #cutoff_contigs[dataframe]=evalue_filter(cutoff_contigs[dataframe])
        recruited_mg[i]=unique_scaffold_topBits(recruited_mg[i])
        contig_list = recruited_mg[i]['quid'].tolist()

        # Convert IDs to strings to avoid string/int FASTA-ID mismatches
        # (previously worked around by renaming).
        contig_list = list(map(str, contig_list))

        recruited_mg[i]['Contig_nt']=retrive_sequence(contig_list, all_records[i])
        recruited_mg[i]['Contig_size']=recruited_mg[i]['Contig_nt'].apply(lambda x: len(x))
        #recruited_mg[i]['Ref_nt']=recruited_mg[i]['suid'].apply(lambda x: all_input_recs[str(x)].seq)
        recruited_mg[i]['Ref_size']=recruited_mg[i]['suid'].apply(lambda x: len(all_input_recs[str(x)]))
        #TODO: make a better adaptation
        recruited_mg[i]['Ref_GC']=0.0
        #recruited_mg[i]['Ref_GC']=recruited_mg[i]['suid'].apply(lambda x: GC(all_input_recs[str(x)].seq))
        #recruited_mg[i]['Coverage']=recruited_mg[i]['alen'].apply(lambda x: 100.0*float(x))/min(recruited_mg[i]['Contig_size'].apply(lambda y: y),recruited_mg[i]['Ref_size'].apply(lambda z: z))
        #df.loc[:, ['B0', 'B1', 'B2']].min(axis=1)
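        # Coverage: alignment length as a percentage of the shorter of the
        # recruited contig and the reference; Metric then folds in % identity.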
        recruited_mg[i]['Coverage']=recruited_mg[i]['alen'].apply(lambda x: 100.0*float(x))/recruited_mg[i].loc[:,["Contig_size", "Ref_size"]].min(axis=1)
        recruited_mg[i]['Metric']=recruited_mg[i]['Coverage']*recruited_mg[i]['iden']/100.0
        try:
            recruited_mg[i]['Contig_GC']=recruited_mg[i]['Contig_nt'].apply(lambda x: GC(x))
        except:
            recruited_mg[i]['Contig_GC']=recruited_mg[i]['Contig_nt'].apply(lambda x: None)
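        # Per-read RPKM = 1 / (reference length in kb * library size in
        # millions of reads); summing Read_RPKM over all recruited reads
        # later yields the total RPKM for the reference.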
        try:
            recruited_mg[i]['Read_RPKM']=1.0/((recruited_mg[i]['Ref_size']/1000.0)*(len(all_records[i])/1000000.0))
        except:
            recruited_mg[i]['Read_RPKM']=np.nan
        
        #recruited_mg[i] = recruited_mg[i][['quid', 'suid', 'iden', 'alen','Coverage','Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Ref_size','Ref_GC','Ref_nt','Contig_size','Contig_GC','Contig_nt']]
        recruited_mg[i] = recruited_mg[i][['quid', 'suid', 'iden', 'alen','Coverage','Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Ref_size','Ref_GC','Contig_size','Contig_GC','Read_RPKM','Contig_nt']]
        print "\tContigs from "+mg_lst[i]+" parsed in : "+str(time.time()-start)+" seconds."
   
# Placeholder: statistics and plot-generation functions would go here.

# Quality filtering before outputting
    if alen_percent:
        for i in range(len(recruited_mg)):
            recruited_mg[i]=recruited_mg[i][(recruited_mg[i]['iden']>=iden)&(recruited_mg[i]['Coverage']>=alen)&(recruited_mg[i]['eval']<=e_val)]
    if alen_bp:
        for i in range(len(recruited_mg)):
            recruited_mg[i]=recruited_mg[i][(recruited_mg[i]['iden']>=iden)&(recruited_mg[i]['alen']>=alen)&(recruited_mg[i]['eval']<=e_val)]
                

#    print  len(recruited_mg[0])
#    print len(recruited_mg[1])

# Batch export to outfmt (csv and/or multiple FASTA)
    alen_str = ""
    iden_str = "_iden_"+str(iden)+"%"
    if alen_percent:
        alen_str = "_alen_"+str(alen)+"%"
    if alen_bp:
        alen_str = "_alen_"+str(alen)+"bp"

    if iterations > 1:
        prefix=name+"/results/"+name.split("/")[0]+"_iter_e_"+str(e_val)+iden_str+alen_str
    else:
        prefix=name+"/results/"+name.split("/")[0]+"_e_"+str(e_val)+iden_str+alen_str
        
    if sheared:
        prefix = prefix+'_sheared_'+str(shear_val)+"bp"
        
    prefix = prefix + "_recruited_mg_"

#initializing log file data

    logfile=name.split("/")[0]+"/results_log.csv"
    try:
        run = int(name.split("/")[-1].split("_")[-1])  # splitting on "_" keeps this less dependent on the wrapper script
    except:
        if name.split("/")[-1].split("_")[-1]==name:
            run = 0
        else:
            print "Warning: Run identifier could not be written in: "+logfile
            #sys.exit(1)
            run = None
    alen_header = "Min alen"
    if alen_bp:
        alen_header = alen_header+" (bp)"
    if alen_percent:
        alen_header = alen_header+" (%)"
        
    shear_header = "Reference Shear (bp)"
    shear_log_value = 0
    if sheared:
        shear_log_value = str(shear_val)
        
    
    print "\nWriting files:"

    for i in range(len(mg_lst)):
        records= []
        if "csv" in fmt_lst:
            outfile1 = prefix+str(i)+".csv"
            recruited_mg[i].to_csv(outfile1, sep='\t')
            print str(len(recruited_mg[i]))+" sequences written to "+outfile1
        if "fasta" in fmt_lst:
            ids = recruited_mg[i]['quid'].tolist()

            # Convert IDs to a list of strings (fix for the renaming error).
            ids = list(map(str, ids))

            #if len(ids)==len(sequences):
            for j in range(len(ids)):
                records.append(all_records[i][ids[j]])
            outfile2 = prefix+str(i)+".fasta" 
            with open(outfile2, "w") as output_handle:

                #SeqIO.write(records, output_handle, "fasta")
                # "fasta-2line" writes each record without line wrapping.
                SeqIO.write(records, output_handle, "fasta-2line")
            print str(len(ids))+" sequences written to "+outfile2
            
#Writing logfile
        
        try:
            time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        except:
            print "Warning: Time identifier could not be written in: "+logfile
        metagenome = mg_lst[i]
        #contig info
        
        rpkm_lst = np.array(recruited_mg[i]['Read_RPKM'].tolist())
        if(len(rpkm_lst) > 0):
            rpkm = np.sum(rpkm_lst)
            rpkm_std= np.std(rpkm_lst)
            rpkm_sem = np.std(rpkm_lst)/np.sqrt(len(rpkm_lst))  # SEM = std / sqrt(n)

        else:
            rpkm = 0
            rpkm_std= 0
            rpkm_sem=0
        
        
        sizes_lst = np.array(recruited_mg[i]['Contig_size'].tolist())
        if(len(sizes_lst) > 0):
            sizes_avg = np.mean(sizes_lst)
            sizes_avg_std= np.std(sizes_lst)
            if(len(sizes_lst) > 1):
                sizes_avg_sem = stats.sem(sizes_lst, axis=0)
            else:
                sizes_avg_sem = 0
        else:
            sizes_avg = 0
            sizes_avg_std= 0
            sizes_avg_sem=0
        #sizes_avg_sem = stats.sem(sizes_lst, axis=0)
        
        alen_lst = np.array(recruited_mg[i]['alen'].tolist())
        if(len(alen_lst) > 0):
            alen_avg = np.mean(alen_lst)
            alen_avg_std = np.std(alen_lst)
            if(len(alen_lst) > 1):
                alen_avg_sem = stats.sem(alen_lst, axis=0)
            else:
                alen_avg_sem = 0
        else:
            alen_avg = 0
            alen_avg_std = 0
            alen_avg_sem=0
        #alen_avg_sem = stats.sem(alen_lst, axis=0)
        
        iden_lst = np.array(recruited_mg[i]['iden'].tolist())
        if(len(iden_lst) > 0):
            iden_avg = np.mean(iden_lst)
            iden_avg_std = np.std(iden_lst)
            if(len(iden_lst) > 1):
                iden_avg_sem = stats.sem(iden_lst, axis=0)
            else:
                iden_avg_sem = 0
        else:
            iden_avg = 0
            iden_avg_std = 0
            iden_avg_sem=0
        #iden_avg_sem = stats.sem(iden_lst, axis=0)

        gc_lst = np.array(recruited_mg[i]['Contig_GC'].tolist())
        if(len(gc_lst) > 0):
            gc_avg = np.mean(gc_lst)
            gc_avg_std = np.std(gc_lst)
            if(len(gc_lst) > 1):
                gc_avg_sem = stats.sem(gc_lst, axis=0)
            else:
                gc_avg_sem = 0
        else:
            gc_avg = 0
            gc_avg_std = 0
            gc_avg_sem=0
        
        if ref_cnt > 0:
            recr_percent = float(len(ids))/float(len(all_records[i]))*100
        else:
            recr_percent = 0.0


        
        #log_header = ['Run','Project Name','Created', 'Reference(s)','Metagenome', 'No. Contigs','No. References', alen_header, "Min iden (%)", shear_header, "Mean Contig Size (bp)","STD Contig Size", "SEM Contig Size", "Mean Contig alen (bp)","STD Contig alen", "SEM Contig alen", "Mean Contig iden (bp)","STD Contig iden", "SEM Contig iden", "Mean Contig GC (%)","STD Contig GC","SEM Contig GC","Mean Reference GC (%)","STD Reference GC","SEM Reference GC"]
        log_header = ['Run','Project Name','Created', 'Reference(s)', shear_header,'No. Ref. Sequences','Metagenome','No. Metagenome Contigs' , alen_header, "Min iden (%)",'No. Recruited Contigs','% Recruited Contigs', 'Total RPKM', 'RPKM STD', 'RPKM SEM', "Mean Rec. Contig Size (bp)","STD Rec. Contig Size", "SEM Rec. Contig Size", "Mean alen (bp)","STD alen", "SEM alen", "Mean Rec. Contig iden (bp)","STD Rec. Contig iden", "SEM Rec. Contig iden", "Mean Rec. Contigs GC (%)","STD Rec. Contig GC","SEM Rec. Contig GC","Mean Total Reference(s) GC (%)","STD Total Reference(s) GC","SEM Total Reference(s) GC"]
        #log_row = [run,name.split("/")[0],time_str, ";".join(ref_lst), metagenome, len(ids),ref_cnt, alen, iden, shear_log_value, sizes_avg,sizes_avg_std, sizes_avg_sem, alen_avg,alen_avg_std, alen_avg_sem, iden_avg,iden_avg_std, iden_avg_sem, gc_avg,gc_avg_std, gc_avg_sem,ref_gc_avg,ref_gc_avg_std, ref_gc_avg_sem]
        log_row = [run, name.split("/")[0], time_str, ";".join(ref_lst), shear_log_value, ref_cnt, metagenome, len(all_records[i]), alen, iden, len(recruited_mg[i]), recr_percent, rpkm, rpkm_std, rpkm_sem, sizes_avg, sizes_avg_std, sizes_avg_sem, alen_avg, alen_avg_std, alen_avg_sem, iden_avg, iden_avg_std, iden_avg_sem, gc_avg, gc_avg_std, gc_avg_sem, ref_gc_avg, ref_gc_avg_std, ref_gc_avg_sem]
        if os.path.isfile(logfile):#file exists - appending
            with open(logfile, "a") as log_handle:
                log_writer = csv.writer(log_handle, delimiter='\t')
                log_writer.writerow(log_row)
        else:#no file exists - writing
            with open(logfile,"w") as log_handle:
                log_writer = csv.writer(log_handle, delimiter='\t')
                log_writer.writerow(log_header)
                log_writer.writerow(log_row)
            
            
    close_ind_lst(all_records)
    close_ind_lst([all_input_recs])
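
# A hypothetical invocation of this recruitment script (the script and file
# names are placeholders; the flags map to the getopt options parsed at the
# top of main):
#
#     python recruit.py -r refs.fna -m metagenome.faa -n project1 \
#         -e 1e-10 -a 70% -i 90 -f fasta,csv
#
# or, equivalently, calling main() directly:
#
#     main(["-r", "refs.fna", "-m", "metagenome.faa", "-n", "project1",
#           "-e", "1e-10", "-a", "70%", "-i", "90", "-f", "fasta,csv"])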