コード例 #1
0
def funLocalBlast(sFastaFileName, sGBKFileName, dbName):
    """BLAST an assembly against two databases and write the ANI results to XLSX.

    The FASTA assembly is read, per-contig size/GC and assembly-level
    statistics (contig count, total size, largest contig, N50) are computed,
    and megablast is run first against ``dbName`` and then against ``nt``.
    Each BLAST output is parsed with ``funReadBlast``, extended with
    ``funANICalc``, sorted, and written into the same workbook via
    ``funBlastANI2XLS``.

    Args:
        sFastaFileName: Path to the assembly FASTA file. The last six
            characters are stripped to derive the output names (presumably the
            ".fasta" extension -- confirm against callers).
        sGBKFileName: Path to a GenBank annotation file, or the literal string
            "N/A" when no annotation is available.
        dbName: Name of the first BLAST database to search.

    Raises:
        ValueError: If the FASTA file contains no sequence records.
    """
    from Bio.Blast.Applications import NcbiblastnCommandline
    from Bio import SeqIO
    from Bio.SeqUtils import GC
    import subprocess
    import xlsxwriter
    from funReadBlast import funReadBlast
    from funANICalc import funANICalc
    from funBlastANI2XLS import funBlastANI2XLS

    columnTitleRow = [
        "FDAARGOS_ID",  #0
        "Num_Contig",  #1
        "Assembly_Size",  #2
        "N_50",  #3
        "Largest_Contig_Size",  #4
        "Contig_ID",  #5
        "Contig_Length",  #6
        "Contig_GC",  #7
        "Proposed Organism",  #8
        "Blast_Hit",  #9
        "ACCESSION",  #10
        "Score",  #11
        "Percent_Query_Identity",  #12
        "Percent_Query_Coverage",  #13
        "Scientific_Name",  #14
        "Query_ANI_Coverage",  #15
        "Subject_ANI_Coverage",  #16
        "Query_ANI_Length",  #17
        "Subject_ANI_Length",  #18
        "Query_ANI_HD",  #19
        "Subject_ANI_HD",  #20
        "Query_ANI_Identity",  #21
        "Subject_ANI_Identity",  #22
        "Query_ANI_SE",  #23
        "Subject_ANI_SE"
    ]

    # Output names are derived by dropping the last six characters of the path.
    sFileName = sFastaFileName[0:-6] + '.xlsx'
    sARGOSID = sFastaFileName.split("/")[-1][0:-6]
    sOutFileName = sARGOSID + ".txt"

    # Import FASTA sequences from the assembly file.
    lSeqRecord = list(SeqIO.parse(sFastaFileName, "fasta"))

    # Import annotation: one species entry per contig when there is no GenBank
    # file, otherwise every line of the file that mentions ORGANISM.
    if sGBKFileName == "N/A":
        all_species = ['N/A-N/A'] * len(lSeqRecord)
    else:
        all_species = []
        # The context manager guarantees the handle is closed; the previous
        # code referenced ``f.close`` without calling it.
        with open(sGBKFileName, 'r', errors='ignore') as f:
            for line in f:
                if "ORGANISM" in line:
                    print(line)
                    all_species.append(line)

    if not lSeqRecord:
        raise ValueError("no sequences found in " + sFastaFileName)

    # Contig statistics.
    lSize = [len(seq_record.seq) for seq_record in lSeqRecord]
    lGC = [GC(seq_record.seq) for seq_record in lSeqRecord]

    nTotalAssemblySize = sum(lSize)
    nNumContig = len(lSize)
    nLargestContig = max(lSize)

    # N50: walk the contigs from largest to smallest and stop at the contig
    # whose addition pushes the running total strictly above half the assembly
    # size. Iterating (instead of indexing) cannot run past the end of the list
    # when every contig has length zero.
    nThreshold = 0.5 * nTotalAssemblySize
    nSize = 0
    nN50 = 0
    for nContigSize in sorted(lSize, reverse=True):
        nSize += nContigSize
        nN50 = nContigSize
        if nSize > nThreshold:
            break

    def run_blast_and_ani(sDb, sBanner):
        """Run megablast against ``sDb`` and return the sorted ANI table."""
        blastn_cline = NcbiblastnCommandline(
            task="megablast",
            query=sFastaFileName,
            db=sDb,
            evalue=0.001,
            max_target_seqs=5,
            outfmt='"6 qseqid qlen sscinames sacc stitle length score pident qcovs"',
            out=sOutFileName)

        print(sBanner)
        # NOTE(review): the command is executed through the shell with
        # hard-coded BLASTDB and binary locations; file names containing shell
        # metacharacters are not escaped.
        process = subprocess.Popen(
            "export BLASTDB=/Users/yi.yan/Documents/db/:$BLASTDB"
            + "&&/usr/local/ncbi/blast/bin/"
            + str(blastn_cline),
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        proc_out, proc_err = process.communicate()
        if process.returncode != 0:
            # Surface the failure instead of silently discarding stderr.
            print('WARNING: BLAST against ' + sDb + ' exited with status '
                  + str(process.returncode) + ': '
                  + proc_err.decode(errors='replace'))

        tblComplete = funReadBlast(sOutFileName, all_species, sARGOSID,
                                   nNumContig, nTotalAssemblySize, nN50,
                                   nLargestContig, lGC, lSize)

        print('Calculating ANI')
        FinalTbl = funANICalc(tblComplete, lSeqRecord, sDb)
        # Sort on columns 6 and 11 (Contig_Length and Score per
        # columnTitleRow), largest first.
        return sorted(FinalTbl, key=lambda x: (x[6], x[11]), reverse=True)

    # First pass: the caller-supplied database.
    s = run_blast_and_ani(dbName, 'run Refseq Blast')
    workbook = xlsxwriter.Workbook(sFileName)
    funBlastANI2XLS(workbook, s, dbName, columnTitleRow)

    # Second pass: NCBI nt. The same intermediate text file is reused.
    s = run_blast_and_ani("nt", 'Run BLAST NT')
    funBlastANI2XLS(workbook, s, 'NT', columnTitleRow)

    workbook.close()
コード例 #2
0
                        required=True,
                        help="Digite o nome do output em formato fasta",
                        type=str)
    parser.add_argument('-G',
                        '--outputGc',
                        dest="gc_file",
                        required=True,
                        help="Digite a saide do gc",
                        type=str)

    args = parser.parse_args()

    with open(args.fastq_file, 'r') as fastqhandle, open(args.gc_file,
                                                         'w') as gchandle:
        for record in SeqIO.parse(fastqhandle, "fastq"):
            sequences[record.id] = GC(record.seq)
            gchandle.write(f'{record.id}\toriginal\t{sequences[record.id]}\n')
    with open(args.fastq_file2, 'r') as fastqhandle2, open(args.gc_file,
                                                           'a') as gchandle2:
        for record in SeqIO.parse(fastqhandle2, "fastq"):
            sequences_mutado[record.id] = GC(record.seq)
            gchandle2.write(
                f'{record.id}\tmutado\t{sequences_mutado[record.id]}\n')

    figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    ax1.hist(
        sequences.values(),
        bins=1000,
        label='original',
        color='green',
    )
コード例 #3
0
# 3. 统计基因和基因间区不同信息的分布情况
# --------------------------------------------
'''
print(step3)

levelTwoType = []

# Write one TSV row per gene: id, number of direct children, overall GC, GC at
# each codon position, and gene length.
geneDist = os.path.join(outdir, "gene.dist.tsv")
with open(geneDist, 'w') as f:
    f.write("id\ttranscript_num\tgc\tgc1\tgc2\tgc3\tlength\n")
    for gene in db.features_of_type("gene"):
        # Count only the gene's direct (level-1) children. Previously
        # transcript_num was taken from children() without a level.
        # NOTE(review): assumes db is a gffutils FeatureDB, where children()
        # without a level also returns deeper descendants (exons, CDS, ...)
        # -- confirm.
        transcripts = list(db.children(gene, level=1))
        transcriptCounts = str(len(transcripts))
        transcriptType = [t.featuretype for t in transcripts]
        levelTwoType += transcriptType
        geneFa = gene.sequence(fasta)
        gc = GC(geneFa)
        gc123 = GC123(geneFa)
        # Coordinates are treated as inclusive on both ends.
        geneLen = gene.end - gene.start + 1
        items = [
            gene.id, transcriptCounts,
            str(gc),
            str(gc123[1]),
            str(gc123[2]),
            str(gc123[3]),
            str(geneLen)
        ]
        linestr = '\t'.join(items)
        f.write(linestr + '\n')

# Report the distinct feature types seen directly below genes.
print(set(levelTwoType))
コード例 #4
0
    for codon in codonList:
        if codon in codonCntDict:
            codonCntDict[codon]+=1
        else:
            codonCntDict[codon]=1
    featureList=[]
    
#    for codon in codonOrder:
#        featureList.append(random.randint(0,100))

    for codon in codonOrder:
        if codon in codonCntDict:
            featureList.append(float(codonCntDict[codon]))
        else:
            featureList.append(0)
    gcList.append(GC(sequence))
#    featureList.append(GC(sequence))
    lengthList.append(len(sequence))
#    featureList.append(len(sequence))
    phi=methods.calPhiForGene(sequence)
#    featureList.append(phi)
#    featureList.append()
#    featureList.append(methods.calMForGene(sequence))
#    if phi<0.1:
#        continue


    phiList.append(phi)
    tAIList.append(calculateTAI.calculateOneGene(sequence,species))
#    CAIList.append(calculateCAI.calculateCAIForAGene(sequence))
    featureList=minmax_scale(featureList) # Here does the scale
コード例 #5
0
ファイル: circos_utils.py プロジェクト: thisisliuqing/TPutils
def contig2gc(records):
    """Return a dict mapping each record's ``name`` to ``GC`` of its ``seq``.

    If several records share a name, the value of the last one is kept.
    """
    from Bio.SeqUtils import GC
    return {rec.name: GC(rec.seq) for rec in records}
コード例 #6
0
sum_len = tot_dup_len + tot_non_dup_len

# Length summary of the duplicated regions.
print("Total duplicated region length: %d " % tot_dup_len)
print("Maximum length of duplicated region: %d" % max(dup_len_l))
print("Minimum length of duplicated region: %d" % min(dup_len_l))
print("Median length of duplicated region: %d" % numpy.median(dup_len_l))

# Genome-wide totals and the duplicated fraction.
print("Total non-duplicated region length: %d " % tot_non_dup_len)
print("Sum of duplicated + non-duplicated region length: %d " % sum_len)
print("Total genome chr length: %d" % tot_chr_len)
_dup_percent = tot_dup_len / float(tot_chr_len) * 100
print("Percent of duplicated regions: %.2f%% " % _dup_percent)
print("%d within chromosome matches; %d across chromosome matches"
      % (nr_within_chr, nr_across_chr))

# GC over every sequence in seq_d concatenated, followed by per-gene averages.
mean_GC = GC("".join(str(chrom_seq) for chrom_seq in seq_d.values()))
_gene_gc_means = (
    mean_GC,
    numpy.mean(dup_gene_gc_l),
    numpy.mean(non_dup_gene_gc_l),
    numpy.mean(dup_gene_gc3_l),
    numpy.mean(non_dup_gene_gc3_l),
)
print("Whole genome GC %.2f; Avg GC of duplicated genes %.2f; "
      "Avg GC of non-duplicated genes %.2f; "
      "Avg GC3 of duplicated genes %.2f; "
      "Avg GC3 of non-duplicated genes %.2f "
      % _gene_gc_means)

_gene_gc_medians = (
    numpy.median(dup_gene_gc_l),
    numpy.median(non_dup_gene_gc_l),
    numpy.median(dup_gene_gc3_l),
    numpy.median(non_dup_gene_gc3_l),
)
print("Median GC of duplicated genes %.2f; "
      "Median GC of non-duplicated genes %.2f; "
      "Median GC3 of duplicated genes %.2f; "
      "Median GC3 of non-duplicated genes %.2f "
      % _gene_gc_medians)

# GC of the concatenated duplicated / non-duplicated region sequences.
mean_dup_GC = GC("".join(str(region) for region in dup_seq_l))
mean_non_dup_GC = GC("".join(str(region) for region in non_dup_seq_l))
print("Avg GC of duplicated regions %.2f; "
      "Avg GC of non-duplicated regions %.2f "
      % (mean_dup_GC, mean_non_dup_GC))

# Replace the list with the r_repr() of a FloatVector built from it.
non_dup_gc_l = FloatVector(non_dup_gc_l).r_repr()
コード例 #7
0
def main(argv):

    #default parameters
    mg_lst = []
    ref_lst = []
    e_val = 1e-5
    alen = 50.0
    alen_percent = True
    alen_bp = False
    iden = 95.0
    name = "output"
    fmt_lst = ["fasta"]
    supported_formats = ["fasta", "csv"]
    iterations = 1
    alen_increment = 5.0
    iden_increment = 0.0
    blast_db_Dir = ""
    results_Dir = ""
    input_files_Dir = ""
    ref_out_0 = ""
    blasted_lst = []
    continue_from_previous = False  #poorly supported, just keeping the directories
    skip_blasting = False
    debugging = False
    sheared = False
    shear_val = None
    logfile = ""

    try:
        opts, args = getopt.getopt(argv, "r:m:n:e:a:i:s:f:h", [
            "reference=", "metagenome=", "name=", "e_value=",
            "alignment_length=", "identity=", "shear=", "format=",
            "iterations=", "alen_increment=", "iden_increment=",
            "continue_from_previous", "skip_blasting", "debugging", "help"
        ])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
#        elif opt in ("--recover_after_failure"):
#            recover_after_failure = True
#            print "Recover after failure:", recover_after_failure

        elif opt in ("--continue_from_previous"):
            continue_from_previous = True
            if debugging:
                print "Continue after failure:", continue_from_previous
        elif opt in ("--debugging"):
            debugging = True
            if debugging:
                print "Debugging messages:", debugging

        elif opt in ("-r", "--reference"):
            if arg:
                ref_lst = arg.split(',')
                #infiles = arg
            if debugging:
                print "Reference file(s)", ref_lst
        elif opt in ("-m", "--metagenome"):
            if arg:
                mg_lst = arg.split(',')
                #infiles = arg
            if debugging:
                print "Metagenome file(s)", mg_lst

        elif opt in ("-f", "--format"):
            if arg:
                fmt_lst = arg.split(',')
                #infiles = arg
            if debugging:
                print "Output format(s)", fmt_lst

        elif opt in ("-n", "--name"):
            if arg.strip():
                name = arg
            if debugging:
                print "Project name", name

        elif opt in ("-e", "--e_value"):
            try:
                e_val = float(arg)
            except:
                print "\nERROR: Please enter numerical value as -e parameter (default: 1e-5)"
                usage()
                sys.exit(1)
            if debugging:
                print "E value", e_val

        elif opt in ("-a", "--alignment_length"):
            if arg.strip()[-1] == "%":
                alen_bp = False
                alen_percent = True
            else:
                alen_bp = True
                alen_percent = False

            try:
                alen = float(arg.split("%")[0])
            except:
                print "\nERROR: Please enter a numerical value as -a parameter (default: 50.0)"
                usage()
                sys.exit(1)
            if debugging:
                print "Alignment length", alen

        elif opt in ("-i", "--identity"):
            try:
                iden = float(arg)
            except:
                print "\nERROR: Please enter a numerical value as -i parameter (default: 95.0)"
                usage()
                sys.exit(1)
            if debugging:
                print "Alignment length", iden
        elif opt in ("-s", "--shear"):
            sheared = True
            try:
                shear_val = int(arg)
            except:
                print "\nERROR: Please enter an integer value as -s parameter"
                usage()
                sys.exit(1)
            if debugging:
                print "Alignment length", iden
        elif opt in ("--iterations"):
            try:
                iterations = int(arg)
            except:

                print "\nWARNING: Please enter integer value as --iterations parameter (using default: 1)"
            if debugging:
                print "Iterations: ", iterations

        elif opt in ("--alen_increment"):

            try:
                alen_increment = float(arg)
            except:
                print "\nWARNING: Please enter numerical value as --alen_increment parameter (using default: )", alen_increment
            if debugging:
                print "Alignment length increment: ", alen_increment

        elif opt in ("--iden_increment"):

            try:
                iden_increment = float(arg)
            except:
                print "\nWARNING: Please enter numerical value as --iden_increment parameter (using default: )", iden_increment
            if debugging:
                print "Alignment length increment: ", iden_increment
        elif opt in ("--skip_blasting"):
            skip_blasting = True
            if debugging:
                print "Blasting step omitted; Using previous blast output."

    for ref_file in [x for x in ref_lst if x]:
        try:
            #
            with open(ref_file, "rU") as hand_ref:
                pass
        except:
            print "\nERROR: Reference File(s) [" + ref_file + "] doesn't exist"
            usage()
            sys.exit(1)

    for mg_file in [x for x in mg_lst if x]:
        try:
            #
            with open(mg_file, "rU") as hand_mg:
                pass
        except:
            print "\nERROR: Metagenome File(s) [" + mg_file + "] doesn't exist"
            usage()
            sys.exit(1)

    for fmt in [x for x in fmt_lst if x]:
        if fmt not in supported_formats:
            print "\nWARNING: Output format [", fmt, "] is not supported"
            print "\tUse -h(--help) option for the list of supported formats"
            fmt_lst = ["fasta"]
            print "\tUsing default output format: ", fmt_lst[0]

    project_dir = name
    if not continue_from_previous:
        if os.path.exists(project_dir):
            shutil.rmtree(project_dir)
        try:
            os.mkdir(project_dir)
        except OSError:
            print "ERROR: Cannot create project directory: " + name
            raise

    print "\n\t Initial Parameters:"
    print "\nProject Name: ", name, '\n'
    print "Project Directory: ", os.path.abspath(name), '\n'
    print "Reference File(s): ", ref_lst, '\n'
    if sheared:
        print "Shear Reference File(s):", str(shear_val) + "bp", '\n'
    print "Metagenome File(s): ", mg_lst, '\n'
    print "E Value: ", e_val, "\n"
    if alen_percent:
        print "Alignment Length: " + str(alen) + '%\n'
    if alen_bp:
        print "Alignment Length: " + str(alen) + 'bp\n'
    print "Sequence Identity: " + str(iden) + '%\n'
    print "Output Format(s):", fmt_lst, '\n'
    if iterations > 1:
        print "Iterations: ", iterations, '\n'
        print "Alignment Length Increment: ", alen_increment, '\n'
        print "Sequence identity Increment: ", iden_increment, '\n'

    #Initializing directories
    blast_db_Dir = name + "/blast_db"
    if not continue_from_previous:
        if os.path.exists(blast_db_Dir):
            shutil.rmtree(blast_db_Dir)
        try:
            os.mkdir(blast_db_Dir)
        except OSError:
            print "ERROR: Cannot create project directory: " + blast_db_Dir
            raise

    results_Dir = name + "/results"
    if not continue_from_previous:

        if os.path.exists(results_Dir):
            shutil.rmtree(results_Dir)
        try:
            os.mkdir(results_Dir)
        except OSError:
            print "ERROR: Cannot create project directory: " + results_Dir
            raise

    input_files_Dir = name + "/input_files"
    if not continue_from_previous:

        if os.path.exists(input_files_Dir):
            shutil.rmtree(input_files_Dir)
        try:
            os.mkdir(input_files_Dir)
        except OSError:
            print "ERROR: Cannot create project directory: " + input_files_Dir
            raise

# Writing raw reference files into a specific input filename
    input_ref_records = {}
    for reference in ref_lst:
        ref_records_ind = parse_contigs_ind(reference)
        #ref_records = dict(ref_records_ind)
        input_ref_records.update(ref_records_ind)
        ref_records_ind.close()
        #input_ref_records.update(ref_records)

    ref_out_0 = input_files_Dir + "/reference0.fna"
    if (sheared & bool(shear_val)):
        with open(ref_out_0, "w") as handle:
            SeqIO.write(
                genome_shredder(input_ref_records, shear_val).values(), handle,
                "fasta")

            #NO NEED TO CLOSE with statement will automatically close the file
    else:
        with open(ref_out_0, "w") as handle:
            SeqIO.write(input_ref_records.values(), handle, "fasta")

# Making BLAST databases
#output fname from before used as input for blast database creation
    input_ref_0 = ref_out_0
    title_db = name + "_db"  #add iteration functionality
    outfile_db = blast_db_Dir + "/iteration" + str(
        iterations) + "/" + name + "_db"  #change into for loop
    os.system("makeblastdb -in " + input_ref_0 + " -dbtype prot -title " +
              title_db + " -out " + outfile_db + " -parse_seqids")

    # BLASTing query contigs
    if not skip_blasting:
        print "\nBLASTing query file(s):"
        for i in range(len(mg_lst)):

            database = outfile_db  # adjust for iterations
            blasted_lst.append(results_Dir + "/recruited_mg_" + str(i) +
                               ".tab")
            start = time.time()
            os_string = 'blastp -db ' + database + ' -query \"' + mg_lst[
                i] + '\" -out ' + blasted_lst[i] + " -evalue " + str(
                    e_val) + "  -outfmt 6 -num_threads 8"
            #print os_string
            os.system(os_string)
            print "\t" + mg_lst[i] + "; Time elapsed: " + str(
                time.time() - start) + " seconds."
    else:
        for i in range(len(mg_lst)):
            blasted_lst.append(results_Dir + "/recruited_mg_" + str(i) +
                               ".tab")

# Parsing BLAST outputs
    blast_cols = [
        'quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta',
        'send', 'eval', 'bits'
    ]
    recruited_mg = []
    for i in range(len(mg_lst)):
        try:
            df = pandas.read_csv(blasted_lst[i], sep="\t", header=None)
        except:
            df = pandas.DataFrame(columns=blast_cols)
        df.columns = blast_cols
        recruited_mg.append(df)

#    print len(recruited_mg[0])
#    print len(recruited_mg[1])

#creating all_records entry
#! Remember to close index objects after they are no longer needed
#! Use helper function close_ind_lst()
    all_records = []
    all_input_recs = parse_contigs_ind(ref_out_0)

    ##calculating GC of the reference
    #    if (len(all_input_recs)>1):
    #TODO: make a better adaptation
    if False:  # I'm adapting the script for blastn
        pass
#       ref_gc_lst = np.array([GC(x.seq) for x in all_input_recs.values()])
#       ref_cnt = ref_gc_lst.size
#       ref_gc_avg = np.mean(ref_gc_lst)
#       ref_gc_avg_std = np.std(ref_gc_lst)
#       if(len(ref_gc_lst) > 0):
#           ref_gc_avg_sem = stats.sem(ref_gc_lst, axis=0)
#       else:
#           ref_gc_avg_sem=0

    else:
        if (debugging):
            print "Only one reference"
        ref_gc_lst = np.array([GC(x.seq) for x in all_input_recs.values()])
        ref_cnt = ref_gc_lst.size
        ref_gc_avg = np.mean(ref_gc_lst)
        ref_gc_avg_std = 0
        ref_gc_avg_sem = 0
    #ref_gc_avg_sem = stats.sem(ref_gc_lst, axis=0)

#    _ = 0
#    for key, value in all_input_recs.items():
#        _ +=1
#        if _ < 20:
#            print key, len(value)

    print "\nIndexing metagenome file(s):"
    for i in range(len(mg_lst)):
        start = time.time()
        all_records.append(parse_contigs_ind(mg_lst[i]))
        print "\t" + mg_lst[i] + " Indexed in : " + str(time.time() -
                                                        start) + " seconds."

# Transforming data
    print "\nParsing recruited contigs:"
    for i in range(len(mg_lst)):
        start = time.time()
        #cutoff_contigs[dataframe]=evalue_filter(cutoff_contigs[dataframe])
        recruited_mg[i] = unique_scaffold_topBits(recruited_mg[i])
        contig_list = recruited_mg[i]['quid'].tolist()

        #this should solve string/int fastaID problem, until now fixed with renaming
        contig_list = list(map(str, contig_list))

        recruited_mg[i]['Contig_nt'] = retrive_sequence(
            contig_list, all_records[i])
        recruited_mg[i]['Contig_size'] = recruited_mg[i]['Contig_nt'].apply(
            lambda x: len(x))
        #recruited_mg[i]['Ref_nt']=recruited_mg[i]['suid'].apply(lambda x: all_input_recs[str(x)].seq)
        recruited_mg[i]['Ref_size'] = recruited_mg[i]['suid'].apply(
            lambda x: len(all_input_recs[str(x)]))
        #TODO: make a better adaptation
        recruited_mg[i]['Ref_GC'] = 0.0
        #recruited_mg[i]['Ref_GC']=recruited_mg[i]['suid'].apply(lambda x: GC(all_input_recs[str(x)].seq))
        #recruited_mg[i]['Coverage']=recruited_mg[i]['alen'].apply(lambda x: 100.0*float(x))/min(recruited_mg[i]['Contig_size'].apply(lambda y: y),recruited_mg[i]['Ref_size'].apply(lambda z: z))
        #df.loc[:, ['B0', 'B1', 'B2']].min(axis=1)
        recruited_mg[i]['Coverage'] = recruited_mg[i]['alen'].apply(
            lambda x: 100.0 * float(x)
        ) / recruited_mg[i].loc[:, ["Contig_size", "Ref_size"]].min(axis=1)
        recruited_mg[i]['Metric'] = recruited_mg[i]['Coverage'] * recruited_mg[
            i]['iden'] / 100.0
        try:
            recruited_mg[i]['Contig_GC'] = recruited_mg[i]['Contig_nt'].apply(
                lambda x: GC(x))
        except:
            recruited_mg[i]['Contig_GC'] = recruited_mg[i]['Contig_nt'].apply(
                lambda x: None)
        try:
            recruited_mg[i]['Read_RPKM'] = 1.0 / (
                (recruited_mg[i]['Ref_size'] / 1000.0) *
                (len(all_records[i]) / 1000000.0))
        except:
            recruited_mg[i]['Read_RPKM'] = np.nan

        #recruited_mg[i] = recruited_mg[i][['quid', 'suid', 'iden', 'alen','Coverage','Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Ref_size','Ref_GC','Ref_nt','Contig_size','Contig_GC','Contig_nt']]
        recruited_mg[i] = recruited_mg[i][[
            'quid', 'suid', 'iden', 'alen', 'Coverage', 'Metric', 'mism',
            'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits', 'Ref_size',
            'Ref_GC', 'Contig_size', 'Contig_GC', 'Read_RPKM', 'Contig_nt'
        ]]
        print "\tContigs from " + mg_lst[i] + " parsed in : " + str(
            time.time() - start) + " seconds."

# Here would go statistics functions and producing plots
#
#
#
#
#

# Quality filtering before outputting
    if alen_percent:
        for i in range(len(recruited_mg)):
            recruited_mg[i] = recruited_mg[i][
                (recruited_mg[i]['iden'] >= iden)
                & (recruited_mg[i]['Coverage'] >= alen) &
                (recruited_mg[i]['eval'] <= e_val)]
    if alen_bp:
        for i in range(len(recruited_mg)):
            recruited_mg[i] = recruited_mg[i][
                (recruited_mg[i]['iden'] >= iden)
                & (recruited_mg[i]['alen'] >= alen) &
                (recruited_mg[i]['eval'] <= e_val)]

#    print  len(recruited_mg[0])
#    print len(recruited_mg[1])

# Batch export to outfmt (csv and/or multiple FASTA)
    alen_str = ""
    iden_str = "_iden_" + str(iden) + "%"
    if alen_percent:
        alen_str = "_alen_" + str(alen) + "%"
    if alen_bp:
        alen_str = "_alen_" + str(alen) + "bp"

    if iterations > 1:
        prefix = name + "/results/" + name.split("/")[0] + "_iter_e_" + str(
            e_val) + iden_str + alen_str
    else:
        prefix = name + "/results/" + name.split("/")[0] + "_e_" + str(
            e_val) + iden_str + alen_str

    if sheared:
        prefix = prefix + '_sheared_' + str(shear_val) + "bp"

    prefix = prefix + "_recruited_mg_"

    #initializing log file data

    logfile = name.split("/")[0] + "/results_log.csv"
    try:
        run = int(name.split("/")[-1].split("_")
                  [-1])  # using "_" less depends on the wrapper script
    except:
        if name.split("/")[-1].split("_")[-1] == name:
            run = 0
        else:
            print "Warning: Run identifier could not be written in: " + logfile
            #sys.exit(1)
            run = None
    alen_header = "Min alen"
    if alen_bp:
        alen_header = alen_header + " (bp)"
    if alen_percent:
        alen_header = alen_header + " (%)"

    shear_header = "Reference Shear (bp)"
    shear_log_value = 0
    if sheared:
        shear_log_value = str(shear_val)

    print "\nWriting files:"

    for i in range(len(mg_lst)):
        records = []
        if "csv" in fmt_lst:
            outfile1 = prefix + str(i) + ".csv"
            recruited_mg[i].to_csv(outfile1, sep='\t')
            print str(len(
                recruited_mg[i])) + " sequences written to " + outfile1
        if "fasta" in fmt_lst:
            ids = recruited_mg[i]['quid'].tolist()

            # fixing the renaming error, converting to list of string
            ids = list(map(str, ids))

            #if len(ids)==len(sequences):
            for j in range(len(ids)):
                records.append(all_records[i][ids[j]])
            outfile2 = prefix + str(i) + ".fasta"
            with open(outfile2, "w") as output_handle:

                #SeqIO.write(records, output_handle, "fasta")
                #this should not have line wrappings
                SeqIO.write(records, output_handle, "fasta-2line")
            print str(len(ids)) + " sequences written to " + outfile2

#Writing logfile

        try:
            time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        except:
            print "Warning: Time identifier could not be written in: " + logfile
        metagenome = mg_lst[i]
        #contig info

        rpkm_lst = np.array(recruited_mg[i]['Read_RPKM'].tolist())
        if (len(rpkm_lst) > 0):
            rpkm = np.sum(rpkm_lst)
            rpkm_std = np.std(rpkm_lst)
            rpkm_sem = np.std(rpkm_lst) * np.sqrt(len(rpkm_lst))

        else:
            rpkm = 0
            rpkm_std = 0
            rpkm_sem = 0

        sizes_lst = np.array(recruited_mg[i]['Contig_size'].tolist())
        if (len(sizes_lst) > 0):
            sizes_avg = np.mean(sizes_lst)
            sizes_avg_std = np.std(sizes_lst)
            if (len(sizes_lst) > 1):
                sizes_avg_sem = stats.sem(sizes_lst, axis=0)
            else:
                sizes_avg_sem = 0
        else:
            sizes_avg = 0
            sizes_avg_std = 0
            sizes_avg_sem = 0
        #sizes_avg_sem = stats.sem(sizes_lst, axis=0)

        alen_lst = np.array(recruited_mg[i]['alen'].tolist())
        if (len(alen_lst) > 0):
            alen_avg = np.mean(alen_lst)
            alen_avg_std = np.std(alen_lst)
            if (len(alen_lst) > 1):
                alen_avg_sem = stats.sem(alen_lst, axis=0)
            else:
                alen_avg_sem = 0
        else:
            alen_avg = 0
            alen_avg_std = 0
            alen_avg_sem = 0
        #alen_avg_sem = stats.sem(alen_lst, axis=0)

        iden_lst = np.array(recruited_mg[i]['iden'].tolist())
        if (len(iden_lst) > 0):
            iden_avg = np.mean(iden_lst)
            iden_avg_std = np.std(iden_lst)
            if (len(iden_lst) > 1):
                iden_avg_sem = stats.sem(iden_lst, axis=0)
            else:
                iden_avg_sem = 0
        else:
            iden_avg = 0
            iden_avg_std = 0
            iden_avg_sem = 0
        #iden_avg_sem = stats.sem(iden_lst, axis=0)

        gc_lst = np.array(recruited_mg[i]['Contig_GC'].tolist())
        if (len(gc_lst) > 0):
            gc_avg = np.mean(gc_lst)
            gc_avg_std = np.std(gc_lst)
            if (len(gc_lst) > 1):
                gc_avg_sem = stats.sem(gc_lst, axis=0)
            else:
                gc_avg_sem = 0
        else:
            gc_avg = 0
            gc_avg_std = 0
            gc_avg_sem = 0

        if ref_cnt > 0:
            recr_percent = float(len(ids)) / float(len(all_records[i])) * 100
        else:
            recr_percent = 0.0

        #log_header = ['Run','Project Name','Created', 'Reference(s)','Metagenome', 'No. Contigs','No. References', alen_header, "Min iden (%)", shear_header, "Mean Contig Size (bp)","STD Contig Size", "SEM Contig Size", "Mean Contig alen (bp)","STD Contig alen", "SEM Contig alen", "Mean Contig iden (bp)","STD Contig iden", "SEM Contig iden", "Mean Contig GC (%)","STD Contig GC","SEM Contig GC","Mean Reference GC (%)","STD Reference GC","SEM Reference GC"]
        log_header = [
            'Run', 'Project Name', 'Created', 'Reference(s)', shear_header,
            'No. Ref. Sequences', 'Metagenome', 'No. Metagenome Contigs',
            alen_header, "Min iden (%)", 'No. Recruited Contigs',
            '% Recruited Contigs', 'Total RPKM', 'RPKM STD', 'RPKM SEM',
            "Mean Rec. Contig Size (bp)", "STD Rec. Contig Size",
            "SEM Rec. Contig Size", "Mean alen (bp)", "STD alen", "SEM alen",
            "Mean Rec. Contig iden (bp)", "STD Rec. Contig iden",
            "SEM Rec. Contig iden", "Mean Rec. Contigs GC (%)",
            "STD Rec. Contig GC", "SEM Rec. Contig GC",
            "Mean Total Reference(s) GC (%)", "STD Total Reference(s) GC",
            "SEM Total Reference(s) GC"
        ]
        #log_row = [run,name.split("/")[0],time_str, ";".join(ref_lst), metagenome, len(ids),ref_cnt, alen, iden, shear_log_value, sizes_avg,sizes_avg_std, sizes_avg_sem, alen_avg,alen_avg_std, alen_avg_sem, iden_avg,iden_avg_std, iden_avg_sem, gc_avg,gc_avg_std, gc_avg_sem,ref_gc_avg,ref_gc_avg_std, ref_gc_avg_sem]
        log_row = [
            run,
            name.split("/")[0], time_str, ";".join(ref_lst), shear_log_value,
            ref_cnt, metagenome,
            len(all_records[i]), alen, iden,
            len(ids), recr_percent, rpkm, rpkm_std, rpkm_sem, sizes_avg,
            sizes_avg_std, sizes_avg_sem, alen_avg, alen_avg_std, alen_avg_sem,
            iden_avg, iden_avg_std, iden_avg_sem, gc_avg, gc_avg_std,
            gc_avg_sem, ref_gc_avg, ref_gc_avg_std, ref_gc_avg_sem
        ]
        if os.path.isfile(logfile):  #file exists - appending
            with open(logfile, "a") as log_handle:
                log_writer = csv.writer(log_handle, delimiter='\t')
                log_writer.writerow(log_row)
        else:  #no file exists - writing
            with open(logfile, "w") as log_handle:
                log_writer = csv.writer(log_handle, delimiter='\t')
                log_writer.writerow(log_header)
                log_writer.writerow(log_row)

    close_ind_lst(all_records)
    close_ind_lst([all_input_recs])
コード例 #8
0
from Bio.SeqUtils import GC

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

usage = """
Usage: fasta_seq_gc_content_plot.py fastafile [fastafile...]
"""

# Without at least one FASTA path there is nothing to do: show usage and stop.
if len(sys.argv) <= 1:
    print(usage)
    sys.exit(0)

# GC values of every record of every input file, pooled for the final plot.
gc = []

for file in sys.argv[1:]:
    if not os.path.exists(file):
        print("file not exists: %s" % file)
        # A missing input is an error, so signal it through the exit status.
        sys.exit(1)

    # One "<input>.gc" table per input file: record id, tab, GC value.
    with open(file + ".gc", 'w') as fh:
        for seq in SeqIO.parse(file, "fasta"):
            gccontent = GC(seq.seq)
            gc.append(gccontent)
            # "%d" silently truncated the fractional part of the GC value;
            # keep two decimals instead.
            fh.write("%s\t%.2f\n" % (seq.id, gccontent))

mpl.rc("figure", figsize=(8, 4))
sns.distplot(gc)
# The figure holds the pooled values of all inputs but is named after the
# last input file, because `file` still holds the final loop value here.
plt.savefig(file + ".gc.png")
コード例 #9
0
 def calcularte_CG_content(self, peaks_fa_file):
     """Set ``self.gc_total`` from the FASTA file ``peaks_fa_file``.

     The mean of the per-record GC values is computed first and then
     replaced by the constant 31, so the attribute always ends up as 31.
     """
     records = SeqIO.parse(peaks_fa_file, "fasta")
     per_record_gc = sorted([GC(record.seq) for record in records])
     self.gc_total = statistics.mean(per_record_gc)
     # NOTE(review): this constant discards the mean computed above -- confirm
     # whether the override is intentional or a debugging leftover.
     self.gc_total = 31
コード例 #10
0
ファイル: seq_c.py プロジェクト: CamilleTum/biopython
def get_GC_contents(seq):
    """Return the result of calling ``GC`` on ``seq``."""
    gc_value = GC(seq)
    return gc_value
コード例 #11
0
from Bio import SeqIO
from Bio.SeqUtils import GC
import sys

if len(sys.argv) < 2:
    print("Usage python gc.py fasta_file ")
else:
    # For every record of the FASTA file named on the command line, print its
    # ID, sequence, length, GC value and the percentage of each base.
    fasta_path = sys.argv[1]
    for seq_record in SeqIO.parse(fasta_path, "fasta"):
        print("Sequence ID:  " + seq_record.id)
        print("Sequence :\n" + str(seq_record.seq))
        print("Sequence length " + str(len(seq_record)) + "\n")
        print("GC content: " + str(GC(seq_record.seq)) + "\n")
        # Same order as the report has always used: A, T, G, C.
        for base in "ATGC":
            share = 100.00 * seq_record.seq.count(base) / len(seq_record)
            print(base + " :" + str(share) + "%")
        print("\n")
        
コード例 #12
0
from Bio import SeqIO
from Bio.SeqUtils import GC

# Report the ID and GC value of the record with the highest GC value.
records = list(SeqIO.parse("rosalind_tree.txt", "fasta"))

# `max_gc` (rather than `max`) keeps the builtin max() usable.
max_gc = 0
max_id = ''
for item in records:
    item_gc = GC(item.seq)  # computed once per record
    # Strict comparison: on ties the earliest record wins.
    if item_gc > max_gc:
        max_gc = item_gc
        max_id = item.id

print(max_id)
print(max_gc)
コード例 #13
0
ファイル: fastaparser.py プロジェクト: robsyme/ANCHOR
            yield batch


if Trinity == True:
    # Trinity mode: tabulate ID, length and GC value of every contig in the
    # input FASTA and write two tab-separated summary tables.
    outputfile1 = (prefix + '_ContigDescrp_with_GC.txt')
    outputfile2 = (prefix + '_Contig_Coumpound_List.txt')
    outputfile3 = (prefix + '_ContigDescrp.txt')
    with open(inputfile) as fasta_file:  # Will close handle cleanly
        identifier = []
        length = []
        description = []
        gccontent = []
        for title, sequence in SimpleFastaParser(fasta_file):
            identifier.append(title.split(None, 1)[0])  # First word is ID
            length.append(len(sequence))
            gccontent.append(GC(sequence))
            description.append(
                "No Description")  # Description is "No Description"
    ContigDescrp = DataFrame(
        dict(Contigid=Series(identifier, name='Contigid'),
             ContigLength=Series(length, name='ContigLength'),
             GCContent=Series(gccontent, name='GCContent'),
             Description=Series(description,
                                name='Description'))).set_index(['Contigid'])
    # Fix the column order of the table that includes the GC column.
    ContigDescrp = ContigDescrp[["ContigLength", "Description", "GCContent"]]
    ContigDescrp.to_csv(outputfile1, sep='\t', index=True)
    # Pass `axis` by keyword: the positional form is rejected by pandas 2.x,
    # while the keyword form works on old and new releases alike.
    ContigDescrp = ContigDescrp.drop('GCContent', axis=1)
    ContigDescrp.to_csv(outputfile3, sep='\t', index=True)
    #Getting another column : gene id
コード例 #14
0
search_and_retrieve_fasta("nucleotide", "Blossfeldia[orgn] and rpl16",
                          "blossfeldia_rpl16.fasta")
# Look at output file

### PARSING
from Bio import SeqIO
blossfeldia_rpl16_sequences = list(
    SeqIO.parse("blossfeldia_rpl16.fasta", "fasta"))
# Look at sequence list and blossfeldia_rpl16_sequences[0]

### SEQUENCE OBJECTS
# Both names refer to the sequence of the first record; the underscore name is
# the one submitted to BLAST below.
_first_blossfeldia_rpl16_sequence = blossfeldia_rpl16_sequences[0].seq
first_blossfeldia_rpl16_sequence = blossfeldia_rpl16_sequences[0].seq
# GC %
from Bio.SeqUtils import GC
GC(first_blossfeldia_rpl16_sequence)

# DNA --> RNA --> DNA
first_blossfeldia_rpl16_sequence.transcribe()
first_blossfeldia_rpl16_sequence.back_transcribe()

# DNA Coding Strand --> Protein
first_blossfeldia_rpl16_sequence.translate()

### BLASTING
from Bio.Blast import NCBIWWW
from Bio import SeqIO
result_handle = NCBIWWW.qblast("blastn", "nt",
                               _first_blossfeldia_rpl16_sequence)
# The context manager flushes and closes the XML file even if the write
# fails; previously the handle was left open.
with open("blast_search_on_first_blossfeldia_rpl16_sequence.xml",
          "w") as save_file:
    save_file.write(result_handle.read())
# The result handle is exhausted after read(), so release it as well.
result_handle.close()
コード例 #15
0
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
# print every index together with the letter at that index
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))

# length
print(len(my_seq))
# element at index 22
# NOTE(review): this comment used to say "first element"; index 0 would be the
# first element -- confirm which one was intended.
print(my_seq[22])
# last element
print(my_seq[-1])

# count of the substring "GC" (a count, not the GC value printed below)
print(my_seq.count("GC"))

from Bio.SeqUtils import GC
print(GC(my_seq))

# slicing: indices 1 up to, but not including, 4
print(my_seq[1:4])
# every second element, starting from index 1
print(my_seq[1::2])
# every second element from index 1 up to, but not including, 6
print(my_seq[1:6:2])
# reverse
print(my_seq[::-1])
# slice from index 22; the stop index 35 lies beyond the 32-letter literal above
print(my_seq[22:35])

# concatenation, in both orders
my_seq2 = Seq("EVRNAK")
print(my_seq + my_seq2)
print(my_seq2 + my_seq)

list_of_seqs = [Seq("ACGT"), Seq("AACC"), Seq("GGTT")]
コード例 #16
0
ファイル: create_markdown.py プロジェクト: mbnmbn00/fGAP
def get_stats(D_fasta, D_gff3):
    """Compute gene-structure summary statistics for an annotation.

    Args:
        D_fasta: mapping of scaffold name to a sliceable record exposing
            ``.seq`` (used as ``D_fasta[scaffold][start - 1:end].seq``).
        D_gff3: mapping of protein ID to a list of
            ``(scaffold, start, end, strand, phase)`` tuples. The first run
            of digits found in each ID is part of the sort key.

    Returns:
        A tuple ``(D_cds_coords, protein_lengths, D_stat)``:
        ``D_cds_coords`` maps scaffold to a list of ``(start, end)`` pairs,
        ``protein_lengths`` holds one value per gene (CDS length / 3) and
        ``D_stat`` maps a statistic label to its value or value pair.

    Raises:
        ZeroDivisionError: if ``D_gff3`` or ``D_fasta`` is empty.
    """
    # Get stats
    D_stat = {}
    cds_lengths = []
    protein_lengths = []
    exon_lengths = []
    transcript_lengths = []
    intron_lengths = []
    num_introns = []
    num_exons = []
    num_spliced = 0
    single_exon_genes = 0
    total_genes = 0
    D_cds_seq = {}
    D_cds_coords = defaultdict(list)

    # Order genes by the first number in their ID, then by the start
    # coordinate of their first segment.
    sorted_genes = sorted(
        D_gff3.items(), key=lambda x: (
            int(re.findall(r'\d+', x[0])[0]),
            x[1][0][1]
        )
    )

    for prot_id, tuples in sorted_genes:
        total_genes += 1
        tmp_prot_len = 0
        if len(tuples) > 1:
            num_spliced += 1

        cds_seq = ''
        for tup in tuples:
            scaffold, start, end, strand, phase = tup
            # Trim the phase offset from the segment listed first on '+'
            # and from the segment listed last on '-'.
            if strand == '+' and tup == tuples[0]:
                start = start + phase
            elif strand == '-' and tup == tuples[-1]:
                end = end - phase

            tmp_prot_len += end - start + 1
            exon_lengths.append(end - start + 1)
            # Get sequence
            cds_seq += str(D_fasta[scaffold][start - 1:end].seq)
            # Store in dictionary
            D_cds_coords[scaffold].append((start, end))

        # `strand` is the strand of the last segment of this gene.
        if strand == '-':
            cds_seq = get_reverse_complement(cds_seq)

        D_cds_seq[prot_id] = cds_seq
        cds_length = tmp_prot_len
        cds_lengths.append(cds_length)
        protein_length = tmp_prot_len / 3
        protein_lengths.append(protein_length)
        transcript_length = int(tuples[-1][2]) - int(tuples[0][1]) + 1
        transcript_lengths.append(transcript_length)
        num_intron = len(tuples) - 1
        if num_intron > 0:
            # An intron spans the gap between the end of one segment and the
            # start of the next.
            intron_start = [x[2] for x in tuples[:-1]]
            intron_end = [x[1] for x in tuples[1:]]
            intron_length = [
                y - x - 1 for x, y in zip(intron_start, intron_end)
            ]
            intron_lengths += intron_length
            num_introns.append(len(tuples) - 1)
        num_exons.append(len(tuples))
        if len(tuples) == 1:
            single_exon_genes += 1

    if intron_lengths:
        intron_median = np.median(np.array(intron_lengths))
        intron_len_average = np.average(np.array(intron_lengths))
        num_introns_median = np.median(np.array(num_introns))
    else:
        # No gene has an intron. The per-gene zero defaults that used to be
        # set inside the loop were always overwritten by NaN computed from
        # empty arrays; report zeros instead.
        intron_median = 0
        intron_len_average = 0
        num_introns_median = 0
    exon_median = np.median(np.array(exon_lengths))
    exon_len_average = np.average(np.array(exon_lengths))
    cds_average = np.average(cds_lengths)
    cds_median = np.median(cds_lengths)
    protein_average = np.average(np.array(protein_lengths))
    protein_median = np.median(np.array(protein_lengths))
    transcript_median = np.median(np.array(transcript_lengths))
    transcript_average = np.average(np.array(transcript_lengths))
    num_exons_median = np.median(np.array(num_exons))

    # Share of multi-segment genes and genes per million bases
    percent_splice = round(float(num_spliced) / total_genes * 100, 2)
    total_bases_lst = [len(str(x.seq)) for x in D_fasta.values()]
    total_bases = sum(total_bases_lst)
    gene_density = float(total_genes) / total_bases
    gene_density = gene_density * 1000000
    gene_density = round(gene_density, 2)

    # Get GC content of CDS seq
    full_cds_seq = ''.join(D_cds_seq.values())
    my_seq = Seq(full_cds_seq, IUPAC.unambiguous_dna)
    cds_gc_percent = GC(my_seq)
    # Percent coding
    coding_percent = float(len(full_cds_seq)) / total_bases
    coding_percent = coding_percent * 100
    coding_percent = round(coding_percent, 2)

    # Pairs below are (average, median) unless the label says otherwise.
    D_stat['Total genes'] = total_genes
    D_stat['Transcript length'] = (
        round(transcript_average, 1), transcript_median
    )
    D_stat['CDS length'] = (round(cds_average, 1), cds_median)
    D_stat['Protein length'] = (round(protein_average, 1), protein_median)
    D_stat['Exon length'] = (round(exon_len_average, 1), exon_median)
    D_stat['Intron length'] = (round(intron_len_average, 1), intron_median)
    D_stat['Spliced'] = (num_spliced, percent_splice)
    D_stat['Gene density'] = gene_density
    D_stat['Num introns'] = sum(num_introns)
    D_stat['Num introns per gene'] = num_introns_median
    D_stat['Num exons'] = sum(num_exons)
    D_stat['Num exons per gene'] = num_exons_median
    D_stat['Num single exon genes'] = single_exon_genes
    D_stat['Percent coding region'] = (len(full_cds_seq), coding_percent)
    D_stat['Coding region GC'] = round(cds_gc_percent, 2)

    return D_cds_coords, protein_lengths, D_stat
コード例 #17
0
def complete_tasks(full_seq, des, unique_key):
    """Render the DNA analysis widgets for one sequence.

    Args:
        full_seq: the sequence to analyse; it must provide ``complement``,
            ``reverse_complement``, ``transcribe`` and ``translate``.
        des: description shown when "Description" is selected.
        unique_key: value passed as ``key`` to every widget created here.
    """
    file_details = st.radio("Details", ("Description", "Sequence"),
                            key=unique_key)

    #Show description and sequence in DNA Analysis section
    if file_details == "Description":
        st.write(des)
    elif file_details == "Sequence":
        st.write(full_seq)

    #Nucleotide occurrences plot and colour selector for the bars
    st.subheader("Plot Nucleotide Frequency")
    full_seq_freq = OrderedDict(Counter(full_seq))

    bar1_colour = st.beta_color_picker("Pick Colour for Bar 1", key=unique_key)
    bar2_colour = st.beta_color_picker("Pick Colour for Bar 2", key=unique_key)
    bar3_colour = st.beta_color_picker("Pick Colour for Bar 3", key=unique_key)
    bar4_colour = st.beta_color_picker("Pick Colour for Bar 4", key=unique_key)

    if st.button("Plot Frequency", key=unique_key):
        barlist = plt.bar(full_seq_freq.keys(), full_seq_freq.values())
        # zip() stops at the shorter input, so a sequence with fewer than
        # four distinct symbols no longer raises IndexError; bars beyond the
        # fourth keep their default colour, as before.
        picked_colours = (bar1_colour, bar2_colour, bar3_colour, bar4_colour)
        for bar, colour in zip(barlist, picked_colours):
            bar.set_color(colour)
        st.pyplot()

    st.subheader("Properties")

    #GC Content, GC Melting temp, GC_skew, Complement and reverse complement
    gc_count = GC(full_seq)
    st.write("GC Content: {}".format(gc_count))

    mt = MeltingTemp.Tm_GC(full_seq, strict=False)
    st.write("Melting Temperature based on GC Content: {}".format(mt))

    gc_skew_bases = st.number_input("Enter number of bases", key=unique_key)
    try:
        gc_skew = GC_skew(full_seq, int(gc_skew_bases))
        st.write("GC Skew for {} bases: {}".format(gc_skew_bases, gc_skew))
    except ValueError:
        st.write("Enter a Valid Number for bases")

    # The elif chains below evaluate, and therefore create, a checkbox only
    # while every checkbox before it in the chain is unticked.
    if st.checkbox("Complement", key=unique_key):
        st.write(full_seq.complement())

    elif st.checkbox("Reverse Complement", key=unique_key):
        st.write(full_seq.reverse_complement())

    #Protein Synthesis
    st.subheader("Protein Synthesis")
    p1 = full_seq.translate()
    if st.checkbox("Transcription: DNA to mRNA", key=unique_key):
        st.write(full_seq.transcribe())

    elif st.checkbox("Translation: DNA to 1 letter Amino Acid Sequence",
                     key=unique_key):
        st.write(p1)

    elif st.checkbox("Translation: DNA to 3 letter Amino Acid Sequence",
                     key=unique_key):
        # Drop the "*" characters before converting to three-letter codes.
        full_aa_name = str(p1).replace("*", "")
        st.write(seq3(full_aa_name))

    elif st.checkbox("Plot Amino Acid Frequency", key=unique_key):
        aa_freq = OrderedDict(Counter(str(p1)))
        bar_colour = st.beta_color_picker("Pick Colour for all Bars",
                                          key=unique_key)
        plt.bar(aa_freq.keys(), aa_freq.values(), color=bar_colour)
        st.pyplot()
        st.write("Asterisk (*) - Denotes Stop Codons.")
コード例 #18
0
import pylab

# Part 1 - Read the GenBank file (the sequence of the last record is kept)
for i in SeqIO.parse("NC_017108.gbk", "genbank"):
    seq = str(i.seq)

# Part 2 - Key variables
tamanho = len(seq)
fragmentos = int(tamanho / 10000)  # number of complete 10000 bp fragments
gc = []

# Part 3 - Store the GC content of each fragment.
# range() walks the start positions of every fragment, including a shorter
# final one, so short sequences work and no empty trailing fragment (which
# added a spurious 0% point) is produced. The slice end is exclusive, hence
# j + 10000: the previous j + 9999 dropped one base per fragment.
for i, j in enumerate(range(0, tamanho, 10000)):
    fim = min(j + 10000, tamanho)
    k = fim - 1  # inclusive end position, used for the printed label
    gc_atual = GC(seq[j:fim])
    gc.append(gc_atual)
    print(i,": ",j,"-",k,"- GC =",gc_atual,"%")

# Part 4 - Plot the result
pylab.plot(gc)
pylab.title("Conteudo GC\n%i fragmentos de 10000 pb variando de %0.1f%% \
    a %0.1f%%" % (len(gc),min(gc),max(gc)))
コード例 #19
0
def _bin_ids_by_gc(fastafile):
    # Group the record IDs of a FASTA file into 2%-wide GC bins over [20, 70).
    # Returns (bins, record_count); records outside that range are counted
    # but not binned.
    bins = {}
    record_count = 0
    for record in SeqIO.parse(fastafile, 'fasta'):
        record_count += 1
        GCcontent = float(GC(record.seq))
        for lowerbound in range(20, 70, 2):
            upperbound = lowerbound + 2
            if lowerbound <= GCcontent < upperbound:
                key = '%s_to_%s' % (lowerbound, upperbound)
                bins.setdefault(key, []).append(record.id)
                break
    return bins, record_count


def subsampleGC(modelfasta, subsamplefasta):
    """Subsample ``subsamplefasta`` so its GC distribution follows ``modelfasta``.

    Records of both files are binned by GC value (bins such as '48_to_50').
    For every bin present in the model, the number of records to pick is the
    bin's density in the model times the number of records in
    ``subsamplefasta``; if the bin holds more records than that, a random
    sample is taken, otherwise all of its records are kept.

    Args:
        modelfasta: FASTA file whose GC distribution is the target.
        subsamplefasta: FASTA file to draw records from.

    Returns:
        A list of records, each a two-item list ``['>' + id, sequence]``, in
        the order they appear in ``subsamplefasta``.
    """
    #modelbins and subsamplebins are dictionaries where key is bin (e.g. '48_to_50') and value is list of IDs
    modelbins, modelfastarecords = _bin_ids_by_gc(modelfasta)
    subsamplebins, subsamplefastarecords = _bin_ids_by_gc(subsamplefasta)

    #Number of records in each fasta file...used for calculating density
    modelfastarecords = float(modelfastarecords)
    subsamplefastarecords = float(subsamplefastarecords)

    subsampledIDs = []
    for modelbin in modelbins:
        modelbindens = float(len(modelbins[modelbin]) / modelfastarecords)
        #Number of records to pick is density of that bin in modelfasta * number of records in subsamplefasta
        subsample_records_to_pick = int(round(modelbindens * subsamplefastarecords))

        if modelbin in subsamplebins:
            candidates = subsamplebins[modelbin]
            if len(candidates) > subsample_records_to_pick:
                #pick random records
                subsampledIDs += random.sample(candidates, subsample_records_to_pick)
            else:
                #pick all records
                subsampledIDs += candidates

    #Reassemble fasta from chosen IDs; a set makes each membership test O(1)
    chosen = set(subsampledIDs)
    subsampledfasta = [
        ['>' + str(record.id), record.seq]
        for record in SeqIO.parse(subsamplefasta, 'fasta')
        if record.id in chosen
    ]

    # Single-argument call form: valid as a Python 2 print statement and as a
    # Python 3 function call.
    print('There were %i records in the model fasta and %i in the fasta to be subsampled.  %i records were chosen in the sampling.' % (modelfastarecords, subsamplefastarecords, len(subsampledIDs)))

    #Return a list of fasta records.  Each record is itself a list where the first item is the ID and the second is the sequence
    return subsampledfasta
コード例 #20
0
def _sum_matching_genes(gene_dir, gmin, gmax, on_complement, gene_span):
    # Sum the GC value, count and length of the genes in the .ffn files of
    # gene_dir that share the read's strand flag and lie inside [gmin, gmax].
    gc_total = 0
    num_gene = 0
    length = 0
    for j in os.listdir(gene_dir):
        if not j.endswith(".ffn"):
            continue
        with open(gene_dir + '\\' + j, 'r') as file_handle1:
            for seq1 in SeqIO.parse(file_handle1, "fasta"):
                inpstr1 = seq1.description
                u = gene_span.findall(inpstr1)[0].split('-')
                # Compare coordinates as integers; as strings "100" would
                # sort before "20".
                smin = int(u[0])
                smax = int(u[1])
                if on_complement != ('|:c' in inpstr1):
                    continue
                if gmin <= smin and gmax >= smax:
                    na_seq = str(seq1.seq)
                    gc_total = gc_total + GC(na_seq)
                    num_gene = num_gene + 1
                    length = length + len(na_seq)
    return gc_total, num_gene, length


def gc_extract():
    """Collect mean GC value, gene count and mean gene length per read.

    For each directory named in the positive and negative example lists, every
    read of ``grinder-reads.fa`` is matched against the genes in that
    directory's ``.ffn`` files: a gene matches when it has the same complement
    flag as the read and its coordinate range lies within the read's range.
    Reads with at least one matching gene contribute one entry to each list.

    Returns:
        dict with the parallel lists 'Name' ('pos'/'neg' plus a running
        number), 'GCcontent', 'num_of_gene' and 'length_of_gene'.

    Raises:
        IndexError: if a description contains no coordinate range.
    """
    pos_neg_files = ['phycodnaviridae_virus_name.txt', 'phage_name.txt']
    data_path = 'C:\\Users\\Reema\\Documents\\SDSU_Education\\Thesis_Phyco\\'
    path = 'C:\\Users\\Reema\\Documents\\SDSU_Education\\Thesis_Phyco\\all_fna\\all.fna\\'
    path1 = 'C:\\Users\\Reema\\Documents\\SDSU_Education\\Thesis_Phyco\\all_ffn\\all.ffn\\'
    eg_list = ['pos', 'neg']
    # Compiled once instead of once per record; raw strings keep the regex
    # escapes literal.
    read_span = re.compile(r'[0-9]+\.\.[0-9]+')
    gene_span = re.compile(r'[0-9]+\-[0-9]+ ')

    gc_list = []
    num_gene_list = []
    gene_len_list = []
    name_list = []
    for eg_num, example in enumerate(pos_neg_files):
        prepend_str = eg_list[eg_num]
        contig_num = 0
        # Context managers close every handle, including on error.
        with open(data_path + example, 'r') as f:
            for line in f:
                dir_name = line.strip('\n')
                dirpath1 = path1 + dir_name
                file_path = path + dir_name + '\\' + 'grinder-reads.fa'
                with open(file_path, 'r') as file_handle:
                    for seq in SeqIO.parse(file_handle, "fasta"):
                        inpstr = seq.description
                        complement_in_grinder = 'position=complement' in inpstr
                        y = read_span.findall(inpstr)[0].split('..')
                        gmin = int(y[0])
                        gmax = int(y[1])
                        GC_content, num_gene, length = _sum_matching_genes(
                            dirpath1, gmin, gmax, complement_in_grinder,
                            gene_span)
                        # Reads without any matching gene are skipped.
                        if num_gene:
                            gc_list.append(round(GC_content / num_gene, 2))
                            gene_len_list.append(length / num_gene)
                            num_gene_list.append(num_gene)
                            name_list.append(prepend_str + str(contig_num))
                            contig_num = contig_num + 1
    ret_dict = {
        'Name': name_list,
        'GCcontent': gc_list,
        'num_of_gene': num_gene_list,
        'length_of_gene': gene_len_list
    }
    return ret_dict
コード例 #21
0
# IDs of the records, in the order they appear in the input file
headers_list = []

# store every record's sequence under its ID and remember the ID order
for seq in SeqIO.parse( inputFasta , "fasta" ):
    seq_dict[seq.id] = seq.seq
    header = seq.id
    headers_list.append( header )    


#---------/compute GC and length/-------

gc_cont = []
lengths = []

# one GC value and one length per entry of seq_dict, in its iteration order
for seqid in seq_dict:
    gc = GC( seq_dict[seqid] )
    gc_cont.append( gc )
    length = len( seq_dict[seqid] )
    lengths.append( length )
        
#-------------------------------------


num_seqs = len(headers_list)  

# blastn of the file given as the first command-line argument against itself;
# tabular output (query id, subject id, percent identity) goes to blast_out.tmp
# NOTE(review): the records above are read from inputFasta while blastn is
# given sys.argv[1] -- presumably the same file; confirm.
command = [ "blastn" , "-query" , sys.argv[1], "-subject" , sys.argv[1], "-outfmt",
           "6 qseqid sseqid pident", "-out", "blast_out.tmp" ]
subprocess.call(command)

pident_list = []
コード例 #22
0
def _predict_cds(FA):
    # Run txCdsPredict on FA and return ({id: end - start}, {id: score}) read
    # from columns 0, 1, 2 and 5 of its output; the temp file is then removed.
    cmd = "txCdsPredict " + FA + " -anyStart tmp.cds"
    os.system(cmd)
    cds_len = dict()
    cds_score = dict()
    with open("tmp.cds") as temp:
        for line in temp:
            line_array = line.split()
            # The ID is the part of the first column before the first "|".
            trans_id = line_array[0].split('|')[0]
            pred_start = int(line_array[1])
            pred_end = int(line_array[2])
            cds_len[trans_id] = pred_end-pred_start
            cds_score[trans_id] = float(line_array[5])
    os.system("rm tmp.cds")
    return cds_len, cds_score


def _load_hexamer_tables(heDAT):
    # Read the hexamer frequency file into ({hexamer: column 1}, {hexamer: column 2}).
    coding={}
    noncoding={}
    with open(heDAT) as table:
        for line in table:
            fields = line.strip().split()
            if fields[0] == 'hexamer':continue  # header row
            coding[fields[0]] = float(fields[1])
            noncoding[fields[0]] =  float(fields[2])
    return coding, noncoding


def get_feature(FA,heDAT,outFILE):
    """Write one tab-separated feature row per transcript of a FASTA file.

    Args:
        FA: path of the transcript FASTA file; it is also passed to the
            external ``txCdsPredict`` command.
        heDAT: path of the hexamer frequency table (hexamer, coding value,
            noncoding value per row; a row starting with 'hexamer' is skipped).
        outFILE: output prefix; rows are written to ``outFILE + ".feature"``.

    Each row holds, in order: CDS length / sequence length, Fickett score,
    hexamer score, CDS score, CGG%, TAG%, GC, standard deviation of the stop
    counts of the three frames, isoelectric point, ACG%, GGC%, sequence
    length, CGT%, AGC%, GAC%, GGG%, TCA%, CAT%, ORF fullness and CAG%.

    Raises:
        KeyError: if a record ID (up to the first "|") has no txCdsPredict
            prediction.
        ZeroDivisionError: if a sequence is exactly two residues long.
    """
    start_codons='ATG'
    stop_codons='TAG,TAA,TGA'
    strand = "+"
    strinfoAmbiguous = re.compile("X|B|Z|J|U",re.I)
    ptU = re.compile("U",re.I)
    # Only the 3-mers that appear in the output row are computed.
    row_kmers = ("CGG", "TAG", "ACG", "GGC", "CGT", "AGC", "GAC", "GGG",
                 "TCA", "CAT", "CAG")

    # Context managers close every handle, including on error.
    with open(outFILE+".feature", "w") as f_out:
        #### run txCdsPredict on the input FASTA file ####
        cds_len, cds_score = _predict_cds(FA)
        #### build hexamer table from hexamer frequency file (CPAT) ####
        coding, noncoding = _load_hexamer_tables(heDAT)
        #### peptide_length, Fickett_score, ORF_integrity (CPC2) ####
        fickett_obj = Fickett()
        ## read the FASTA file of transcripts
        with open(FA) as f_in:
            for record in SeqIO.parse(f_in, "fasta"):
                first = record.id.split("|")
                seq = record.seq
                seq_len = len(seq)
                # Number of 3-mer windows in the sequence.
                window_count = float(seq_len-2)
                kmer_pct = {
                    kmer: seq.count(kmer)*100/window_count
                    for kmer in row_kmers
                }
                #Translating nucleotides to peptide sequences according to frame shift
                stop = tuple(
                    frame_translation(seq, frame, genetic_code=1).count("*")
                    for frame in range(3)
                )
                std_stop = numpy.std(stop)
                # Replace U with T and upper-case before the ORF search.
                seqRNA = ptU.sub("T",str(seq).strip())
                seqRNA = seqRNA.upper()
                # seqCDS: longest ORF
                seqCDS,start_pos,orf_strand,orf_fullness = FindCDS(seqRNA).longest_orf(strand)
                seqprot = mRNA_translate(seqCDS)
                pep_len = len(seqprot) #pep_len = len(seqprot.strip("*"))
                # exclude ambiguous amino acids X, B, Z, J, U in peptide sequence
                newseqprot = strinfoAmbiguous.sub("",str(seqprot))
                fickett_score = fickett_obj.fickett_value(seqRNA)
                protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot.strip("*")))
                if pep_len > 0:
                    isoelectric_point = protein_param(protparam_obj)
                else:
                    orf_fullness = -1
                    isoelectric_point = 0.0
                hexamer = extract_feature_from_seq(seq = seq, stt = start_codons,stp = stop_codons,c_tab=coding,g_tab=noncoding)
                len_perc = float(cds_len[first[0]])/seq_len
                # Column order is part of the output format; do not reorder.
                row = [
                    "%0.2f"%len_perc,
                    "%s"%str(fickett_score),
                    "%s"%str(hexamer),
                    "%d"%cds_score[first[0]],
                    "%0.4f"%kmer_pct["CGG"],
                    "%0.4f"%kmer_pct["TAG"],
                    "%0.2f"%GC(seq),
                    "%.6f"%std_stop,
                    "%s"%str(isoelectric_point),
                    "%0.4f"%kmer_pct["ACG"],
                    "%0.4f"%kmer_pct["GGC"],
                    "%d"%seq_len,
                    "%0.4f"%kmer_pct["CGT"],
                    "%0.4f"%kmer_pct["AGC"],
                    "%0.4f"%kmer_pct["GAC"],
                    "%0.4f"%kmer_pct["GGG"],
                    "%0.4f"%kmer_pct["TCA"],
                    "%0.4f"%kmer_pct["CAT"],
                    "%s"%str(orf_fullness),
                    "%0.4f"%kmer_pct["CAG"],
                ]
                f_out.write("\t".join(row) + "\n")
コード例 #23
0
def meanGC(lst): #takes in a list of sequence
    """Return ``ss.mstats.gmean`` of the ``GC`` values of the sequences in ``lst``.

    NOTE(review): ``gmean`` is presumably the geometric mean, although the
    function name only says "mean" -- confirm this is intended.
    """
    gc_values = [GC(sequence) for sequence in lst]
    return ss.mstats.gmean(gc_values)
コード例 #24
0
# This script takes a FASTA file and writes the GC content of consecutive,
# non-overlapping windows, one value per line. Command-line arguments, in
# order: input FASTA file, output file, window size.
import sys, os
from Bio import SeqIO
from Bio.SeqUtils import GC

# Converted once, up front, rather than on every record.
window_size = int(sys.argv[3])

# Both handles are closed by the context manager, including on error.
with open(sys.argv[1], "r") as raw_file, open(sys.argv[2], "w") as window_GC:
    for rec in SeqIO.parse(raw_file, "fasta"):
        total_size = len(rec.seq)
        # Windows are written record by record. Previously the windows of
        # earlier records were kept in a shared list and written again for
        # every later record, duplicating output for multi-record files.
        for pos in range(0, total_size, window_size):
            small_chunk = str(rec.seq[pos:pos+window_size])
            gc_content = GC(small_chunk)
            window_GC.write(str(gc_content)+"\n")
コード例 #25
0
# header row of the output table
oup_GC.write("group\tspecies\tGC\tGC1\tGC2\tGC3\n")

# one alignment file per entry of `folder`
for inl in folder:
    print inl
    # group name = file name up to the first ".aln"
    group = inl.split(".aln")[0]
    ortho_groups.append(group)
    # NOTE(review): the command line is built by string formatting and run
    # through the shell; file names are assumed to be shell-safe.
    os.system("trimal -in %s/%s -out trimmed/%s.aln -gt 0.9" %
              (align_folder, inl, inl))  #generate a trimmed alignment
    # to calculate GC content on conserved sections

    align = AlignIO.read("trimmed/%s.aln" % inl, "clustal")

    for seq in align:
        species = seq.id
        # remove gap characters before computing GC
        seqstring = str(seq.seq).replace("-", "")
        GCcont = GC(seqstring)
        # GC of every third character, starting at offsets 0, 1 and 2
        GC1 = GC(seqstring[0::3])
        GC2 = GC(seqstring[1::3])
        GC3 = GC(seqstring[2::3])

        main_dict[species][group] = (GCcont, GC1, GC2, GC3)
        print species, group, GCcont

# de-duplicate the group names before writing
ortho_groups = set(ortho_groups)

# one output row per (group, species) pair; a missing main_dict entry for a
# pair raises here
for i in ortho_groups:
    for spec in species_list:
        oup_GC.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\n" %
                     (i, spec, main_dict[spec][i][0], main_dict[spec][i][1],
                      main_dict[spec][i][2], main_dict[spec][i][3]))
コード例 #26
0
def result_tools3(request):
    """Render the result page with the GC value of the submitted sequence.

    The sequence is read from the POST field 'tool1' (falling back to the
    string 'default') and the value is passed to the template as 'res'.
    """
    submitted = request.POST.get('tool1', 'default')
    gc_value = GC(Seq(submitted))
    return render(request, 'mysite/result_tools.html', {'res': gc_value})
コード例 #27
0
def return_genbank_dict(gb_file, key='annotation', seq_type='amino_acid'):
    """Return a dictionary of gene features from the first record of a GenBank file.

    Only the first record in ``gb_file`` is read. Features of type CDS,
    ncRNA, tRNA, mRNA and rRNA are collected; all other features are skipped.

    Args:
        gb_file: Path of the GenBank file to read.
        key: How feature entries are indexed.
            ``'annotation'`` (the default) indexes by gene name. Features
            without a gene name are stored as ``unknown_1``, ``unknown_2``,
            and so on. Every gene synonym of a named gene gets an entry too.
            Gene names are not guaranteed to be unique, so later features
            overwrite earlier ones with the same name.
            ``'locus'`` indexes by locus tag (falling back to the gene name
            when there is no locus tag).
            Any other value adds no feature entries.
        seq_type: Kept for backward compatibility only; the value is
            overwritten for every feature and never read.

    Returns:
        A dict that always holds ``'accession'`` and ``'common_name'``.
        For ``key='annotation'`` each feature entry is
        ``[header, sequence_string]`` where ``header`` is the ``|``-joined
        accession, common name, locus, gene, start, stop, strand, sequence
        type, synonyms and GC value. For ``key='locus'`` each entry is the
        tuple ``(locus, gene, seq, seq_type, synonyms)``.

    Raises:
        StopIteration: If the file contains no GenBank record.
    """
    gene_feature_types = ('CDS', 'ncRNA', 'tRNA', 'mRNA', 'rRNA')

    result = {}
    # next() works on both Python 2 and 3 (the old ``.next()`` method does
    # not exist on Python 3), and the with-block closes the file once the
    # first record has been parsed.
    with open(gb_file) as handle:
        seq_record = next(SeqIO.parse(handle, "genbank"))
    accession = seq_record.annotations['accessions'][0].split('.')[0]
    common_name = seq_record.annotations['organism'].replace(' ', '_')
    result.update({'accession': accession})
    result.update({'common_name': common_name})

    unk_cnt = 1
    for feature in seq_record.features:
        if feature.type not in gene_feature_types:
            continue

        qualifiers = feature.qualifiers
        start = feature.location.start
        stop = feature.location.end
        strand = feature.strand
        region = seq_record.seq[start:stop]

        synonyms = 'NONE'
        if 'gene_synonym' in qualifiers:
            synonyms = ':'.join(
                qualifiers['gene_synonym'][0].replace(' ', '').split(';'))

        # Prefer the locus tag; fall back to the gene name, then to ''.
        try:
            locus = qualifiers['locus_tag'][0]
        except (KeyError, IndexError):
            try:
                locus = qualifiers['gene'][0]
            except (KeyError, IndexError):
                locus = ''
                print('No locus associated. This should never be invoked meaning you are proper fracked. (The gbk file has an error).')

        try:
            gene = qualifiers['gene'][0]
        except (KeyError, IndexError):
            gene = 'unknown'

        # A feature with a translation is reported as protein; otherwise the
        # nucleotide region is used and a CDS is labelled as a pseudo gene.
        try:
            seq = qualifiers['translation']
            seq_type = 'Protein'
        except KeyError:
            seq = region
            seq_type = 'Pseudo_Gene' if feature.type == 'CDS' else feature.type

        gc = "%2.1f" % GC(region)

        if key == 'locus':
            result.update({locus: (locus, gene, seq, seq_type, synonyms)})
        elif key == 'annotation':
            # The header and sequence text are the same for the gene entry
            # and for all of its synonym entries, so build them once.
            header = '|'.join([
                accession, common_name, locus, gene,
                str(start),
                str(stop),
                str(strand), seq_type, synonyms, gc
            ])
            sequence_text = ''.join(seq)
            if gene == 'unknown':
                new_gene = 'unknown_' + str(unk_cnt)
                result.update({new_gene: [header, sequence_text]})
                unk_cnt += 1
            else:
                result.update({gene: [header, sequence_text]})
                # Synonyms are stored as well to overcome some issues with
                # RegulonDB; this improves operon recovery slightly.
                if synonyms != 'NONE':
                    for syn in synonyms.split(':'):
                        result.update({syn: [header, sequence_text]})

    return result
コード例 #28
0
    def __call__(self, best_solution: PASSolution,
                 mutations: [PASMutationSite],
                 sequences: PASSequences) -> [PASResult]:
        """Build one ``PASResult`` per fragment of ``best_solution``.

        For every fragment this generates its oligos, collects the mutations
        whose codon lies entirely inside the fragment, and computes the
        overlap with the following fragment (sequence, Tm, GC value, length).
        The last fragment has no successor, so its overlap fields are None.
        Fragments at odd zero-based index (the 2nd, 4th, ...) are reported as
        reverse complements, together with their oligos.

        Args:
            best_solution: Solution providing the fragments, the gene
                sequence and the temperature calculator.
            mutations: Mutation sites; ``position`` is used as a 1-based
                codon index relative to ``goi_offset``.
            sequences: Provides the gene-of-interest offset and the full
                sequence used for overlaps.

        Returns:
            List of ``PASResult`` in fragment order; empty when the solution
            has no fragments.
        """
        # Two iterators over the same fragments; the lagged one runs one step
        # ahead so each fragment can be paired with its successor. The default
        # keeps an empty solution from leaking StopIteration to the caller.
        frag_current_it = iter(best_solution.get_fragments())
        frag_lagged_it = iter(best_solution.get_fragments())
        next(frag_lagged_it, None)
        results = []
        goi_offset = sequences.get_goi_offset()
        # sorted list of all distinct mutation sites
        mutation_sites = sorted({mut.position for mut in mutations})

        # creating the output values for every fragment
        for i, frag_current in enumerate(frag_current_it):

            # getting oligos for a fragment, and fragment parameters
            generator = OligoGenerator(self.config,
                                       self.is_mutations_as_codons,
                                       self.config.organism)
            # NOTE(review): the meaning of the literal 250 is not visible
            # here - confirm against OligoGenerator.
            oligos_group = generator(
                frag_current.get_sequence(best_solution.gene.sequence),
                mutations, frag_current, goi_offset, 250)

            fragment_sequence = frag_current.get_sequence(
                best_solution.gene.sequence)

            # A site belongs to the fragment only when all three bases of its
            # codon fall between the fragment's start and end.
            mutation_sites_on_fragment = [
                site for site in mutation_sites
                if ((goi_offset + (site - 1) * 3) >= frag_current.get_start()
                    and (goi_offset +
                         (site - 1) * 3 + 2) <= frag_current.get_end())
            ]
            mutations_on_fragment = [
                mut for mut in mutations
                if mut.position in mutation_sites_on_fragment
            ]
            mutations_on_fragment_formatted = self.combine_mutations_list(
                frag_current, oligos_group, mutation_sites_on_fragment,
                mutations_on_fragment, fragment_sequence, goi_offset)
            list_oligos = combine_oligos_list(oligos_group,
                                              mutations_on_fragment_formatted,
                                              mutation_sites_on_fragment,
                                              goi_offset, frag_current)

            # getting overlap and its parameters; they stay None for the last
            # fragment, which has no successor
            overlap = overlap_Tm = overlap_GC = overlap_length = None
            frag_next = next(frag_lagged_it, None)
            if frag_next is not None:
                try:
                    overlap = frag_current.get_overlap_seq(
                        frag_next, sequences.get_full_sequence())
                    overlap_Tm = best_solution.temp_calculator(overlap)
                    overlap_GC = GC(overlap)
                    overlap_length = len(overlap)
                except Exception:
                    # Best effort, as before: a failed overlap computation
                    # blanks all overlap fields instead of aborting. Unlike a
                    # bare ``except`` this lets KeyboardInterrupt and
                    # SystemExit propagate.
                    overlap = overlap_Tm = overlap_GC = overlap_length = None

            # every second fragment (odd zero-based index) should be the
            # reverse complement of the original sub-sequence; done here
            # because the code above needs the forward direction
            if i % 2 == 1:
                for oligo in list_oligos:
                    oligo.make_reverse_complement()
                fragment_sequence = reverse_complement(fragment_sequence)

            # combining the results together
            result_oligo = PASResult(fragment=fragment_sequence,
                                     start=frag_current.get_start(),
                                     end=frag_current.get_end(),
                                     length=frag_current.get_length(),
                                     overlap=overlap,
                                     overlap_Tm=overlap_Tm,
                                     overlap_GC=overlap_GC,
                                     overlap_length=overlap_length,
                                     mutations=mutations_on_fragment_formatted,
                                     oligos=list_oligos)
            results.append(result_oligo)

        return results
コード例 #29
0
ファイル: xbb_translations.py プロジェクト: yuanzhw/biopython
 def gc(self, seq):
     """Return the GC content of ``seq`` as a percentage (0-100)."""
     gc_percent = GC(seq)
     return gc_percent
コード例 #30
0
def draw_gc(sumgene, dwg, gc_color):
    """Add GC-content and GC-skew wedges for ``sumgene`` to ``dwg``.

    The sequence is cut into 300 windows. For each window two values are
    computed: the window's GC value minus the GC value of the whole sequence,
    and the window's GC skew. Each series is rescaled linearly onto its own
    radial band and drawn as one triangle per window. Two corners sit at the
    band's base radius at the window's start and end, and the third sits at
    the scaled radius at the window's midpoint.

    Args:
        sumgene: Object providing ``seq`` and ``len``; also passed to
            ``position_mapping``.
        dwg: Drawing that receives the polygons.
        gc_color: Fill colours indexed 0-3: GC content at/above and below
            its base radius, then GC skew at/above and below its base radius.

    Returns:
        ``dwg``, with the polygons added.
    """
    # number of windows
    unit_total = 300
    unit_len = int(sumgene.len / unit_total)
    # base radius and radial band for GC content (gcc) and GC skew (gcs)
    gcc_r = 430
    gcs_r = 230
    gcc_range = (330, 530)
    gcs_range = (130, 330)
    gcc_mean = GC(sumgene.seq)

    # Window boundaries: the end advances by unit_len but never passes the
    # sequence length.
    windows = []
    start_unit, end_unit = 0, unit_len - 1
    for _ in range(unit_total):
        windows.append((start_unit, end_unit))
        start_unit += unit_len
        end_unit = min(end_unit + unit_len, sumgene.len)

    # compute gc
    gc_contents = []
    gc_skews = []
    for start_unit, end_unit in windows:
        segment = sumgene.seq[start_unit:end_unit]
        gc_contents.append(GC(segment) - gcc_mean)
        gc_skews.append(GC_skew(segment, window=unit_len)[0])

    gcc_max = max(gc_contents)
    gcc_min = min(gc_contents)
    gcs_max = max(gc_skews)
    gcs_min = min(gc_skews)
    gcc_scal = (gcc_range[1] - gcc_range[0]) / (gcc_max - gcc_min)
    gcs_scal = (gcs_range[1] - gcs_range[0]) / (gcs_max - gcs_min)

    def add_wedge(start, end, mid, base_r, apex_r, above_idx, below_idx):
        # One triangle: base corners at base_r, apex at apex_r over the midpoint.
        base_pos = position_mapping(sumgene, [start, end], base_r)
        apex_pos = position_mapping(sumgene, [mid, 0], apex_r)
        points = [(1500 + base_pos[0], 1500 - base_pos[1]),
                  (1500 + base_pos[2], 1500 - base_pos[3]),
                  (1500 + apex_pos[0], 1500 - apex_pos[1])]
        fill = gc_color[above_idx] if apex_r >= base_r else gc_color[below_idx]
        dwg.add(dwg.polygon(points, fill=fill, stroke_width=0))

    mid_unit = (unit_len - 1) / 2
    for (start_unit, end_unit), gcc_dev, skew in zip(windows, gc_contents,
                                                     gc_skews):
        # draw gc_content
        gcc_nr = (gcc_dev - gcc_min) * gcc_scal + gcc_range[0]
        add_wedge(start_unit, end_unit, mid_unit, gcc_r, gcc_nr, 0, 1)

        # draw gc_skew
        gcs_nr = (skew - gcs_min) * gcs_scal + gcs_range[0]
        add_wedge(start_unit, end_unit, mid_unit, gcs_r, gcs_nr, 2, 3)

        mid_unit += unit_len
    return dwg