コード例 #1
0
def create_res_df_from_bam(input_file, reference):
    """Summarise per-contig read statistics of a BAM file against a reference.

    For every record of the reference FASTA the reads aligned to the contig
    of the same name are fetched from the BAM file and summarised.

    Args:
        input_file: Path of the BAM file, opened with mode ``'rb'``.
            NOTE(review): contig-based ``fetch``/``count`` presumably need an
            index next to the BAM file - confirm against the pysam docs.
        reference: Path of the reference FASTA file.

    Returns:
        A pandas DataFrame with one row per reference record and the columns
        ``species`` (record name), ``chr_length``, ``gc_ref``, ``gc_reads``,
        ``read_count`` and ``basecount`` (total length of the fetched reads).
    """
    species_list, chr_length_list, read_count_list = [], [], []
    basecount_list, gc_ref_list, gc_reads_list = [], [], []

    # Open the BAM file once and close it deterministically; previously two
    # fresh handles were opened for every contig and never closed.
    with pysam.AlignmentFile(input_file, 'rb') as bam:
        for seq_record in SeqIO.parse(reference, 'fasta'):
            contig = seq_record.name

            # Concatenate all reads of this contig. Reads stored without a
            # sequence report None, which would make the join fail.
            read_sequences = (
                read.query_sequence for read in bam.fetch(contig=contig))
            joined_reads = ''.join(
                sequence for sequence in read_sequences
                if sequence is not None)

            species_list.append(contig)
            chr_length_list.append(len(seq_record.seq))
            read_count_list.append(bam.count(contig=contig))
            gc_ref_list.append(SeqUtils.GC(seq_record.seq))
            gc_reads_list.append(SeqUtils.GC(joined_reads))
            basecount_list.append(len(joined_reads))

    # create and return dataframe
    return pd.DataFrame(
        data={
            'species': species_list,
            'chr_length': chr_length_list,
            'gc_ref': gc_ref_list,
            'gc_reads': gc_reads_list,
            'read_count': read_count_list,
            'basecount': basecount_list,
        })
コード例 #2
0
def add_alignment(aln, result):
    """Store size and base-composition statistics of an alignment in ``result``.

    Args:
        aln: Alignment object providing ``ntax``, ``nchar``,
            ``charpartitions["loci"]`` and a ``matrix`` mapping whose values
            convert to sequence strings via ``str``.
        result: Mutable mapping that receives the statistics.

    Returns:
        The same ``result`` mapping, extended with ``n_taxa``, ``n_sites``,
        ``n_datablocks``, ``gc_proportion``, ``gap_proportion`` and the
        ``a/c/g/t_proportion`` entries.
    """
    result["n_taxa"] = aln.ntax
    result["n_sites"] = aln.nchar
    result["n_datablocks"] = len(aln.charpartitions["loci"])

    # All sequences of the matrix glued into a single string.
    all_bases = ''.join(str(row) for row in aln.matrix.values())
    gc_value = SeqUtils.GC(all_bases)

    # Occurrences of the four nucleotides (upper case only).
    counts = {base: float(all_bases.count(base)) for base in 'ATGC'}
    acgt_total = counts['A'] + counts['T'] + counts['G'] + counts['C']

    # '?', '-' and 'N' are all counted as gaps.
    gap_count = float(sum(all_bases.count(symbol) for symbol in '?-N'))
    gap_fraction = gap_count / float(len(all_bases))

    result["gc_proportion"] = gc_value
    result["gap_proportion"] = gap_fraction
    for key, base in (("a_proportion", 'A'), ("c_proportion", 'C'),
                      ("g_proportion", 'G'), ("t_proportion", 'T')):
        # Proportion relative to the unambiguous A/C/G/T bases only.
        result[key] = counts[base] / acgt_total

    return result
コード例 #3
0
def generate_wide_table(all_fastas):
    """Write a tab-separated k-mer table with one row per FASTA record.

    Each row holds the file path, the record description, the sequence
    length, the GC content and one count column for every possible
    A/T/G/C k-mer of length ``args.kmer_length`` (sorted alphabetically).
    A header line is written first unless ``args.append`` is set.

    Relies on the module-level ``args``, ``output_handle`` and ``logger``.

    Args:
        all_fastas: Iterable of FASTA file paths.
    """
    global args, output_handle
    kmer_length = args.kmer_length
    all_kmers = sorted(
        "".join(x) for x in itertools.product("ATGC", repeat=kmer_length))

    records_to_kmer = {}
    for f in all_fastas:
        logger.debug("Processing file %s", f)
        for record in SeqIO.parse(f, "fasta", generic_dna):
            logger.debug("Processing sequence %s", record.description)

            seq = str(record.seq)
            # Identifying columns of the row; also the dictionary key, so
            # records with identical keys overwrite each other.
            fasta_keys = (f, record.description, str(len(seq)),
                          str(SeqUtils.GC(record.seq)))

            kmer_counts = collections.defaultdict(int)
            # "+ 1" so the k-mer ending on the last base is counted as well
            # (the previous bound silently dropped it).
            for i in range(len(seq) - kmer_length + 1):
                kmer_counts[seq[i:i + kmer_length]] += 1
            records_to_kmer[fasta_keys] = kmer_counts

    # write() works on both Python 2 and 3, unlike the "print >>" statement.
    if not args.append:
        output_handle.write("\t".join(
            ["path", "sequence_description", "sequence_length", "GC"] +
            all_kmers) + "\n")
    for k, kmer_values in records_to_kmer.items():
        all_values = list(k)
        # k-mers containing other symbols (e.g. N) have no column and are
        # therefore not reported.
        all_values.extend(str(kmer_values.get(x, 0)) for x in all_kmers)
        output_handle.write("\t".join(all_values) + "\n")
コード例 #4
0
ファイル: fanalyzer.py プロジェクト: jsgounot/CFreecW
def extract_gc_content(fasta, window, step):
    """Yield the GC content of sliding windows over every FASTA record.

    Args:
        fasta: Path or handle accepted by ``SeqIO.parse``.
        window: Number of bases per window; windows at the end of a record
            may be shorter.
        step: Distance between consecutive window starts.

    Yields:
        The ``SeqUtils.GC`` value of each window, record after record.
    """
    for entry in SeqIO.parse(fasta, "fasta"):
        bases = entry.seq
        for start in range(0, len(bases), step):
            stop = start + window
            yield SeqUtils.GC(bases[start:stop])
コード例 #5
0
def get_gen_stats(gbk_list):
    """Compute GC content, coding density and length for GenBank files.

    NOTE: overlapping genes are not merged. Every feature is counted
    separately, so a region shared by several features is counted several
    times and the coding density may be overestimated.

    Args:
        gbk_list: Iterable of GenBank file paths.

    Returns:
        dict mapping each path with ".gbk" removed to a tuple
        ``(length-weighted GC, coding density, total length)``.
    """
    counted_types = ("CDS", "tmRNA", "rRNA", "ncRNA", "tRNA")
    stats_by_genome = {}

    for gbk_file in gbk_list:
        genome_length = 0
        weighted_gc = 0
        coding_length = 0
        for record in SeqIO.parse(gbk_file, "genbank"):
            record_length = len(record)
            genome_length += record_length
            # Weight by record length so the final value is a genome average.
            weighted_gc += SeqUtils.GC(record.seq) * record_length
            for feature in record.features:
                if (feature.type not in counted_types
                        or "pseudo" in feature.qualifiers):
                    continue
                # Iterating over parts also covers compound locations.
                for part in feature.location.parts:
                    coding_length += part.end - part.start

        genome_key = gbk_file.replace(".gbk", "")
        stats_by_genome[genome_key] = (
            float(weighted_gc) / genome_length,
            float(coding_length) / genome_length,
            genome_length,
        )
    return stats_by_genome
コード例 #6
0
def gene_feature(Y):
    """
    Build gene-level features (length, GC content, melting temperature and
    molecular weight of the gene sequence) for every row of ``Y``.

    Returns a DataFrame indexed like ``Y["Target gene"]`` with one column
    per feature.
    """
    gene_names = Y["Target gene"]

    # Columns: length, GC content, temperature, molecular weight.
    features = np.zeros((gene_names.shape[0], 4))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # All rows that target this gene receive the same feature values.
        rows = gene_names.values == gene
        features[rows] = (
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, "DNA"),
        )

    return pd.DataFrame(
        data=features,
        index=gene_names.index,
        columns=[
            "gene length",
            "gene GC content",
            "gene temperature",
            "gene molecular weight",
        ],
    )
コード例 #7
0
ファイル: main.py プロジェクト: 0xrutvij/Archive
def gdps(str_list):
    """Pair the GC content of each sequence with a running coordinate.

    The first sequence gets the coordinate 50 and every following one is
    100 further (50, 150, 250, ...).

    Returns:
        List of ``(coordinate, GC content)`` tuples in input order.
    """
    return [(50 + 100 * position, SeqUtils.GC(strng))
            for position, strng in enumerate(str_list)]
コード例 #8
0
def generate_long_table(all_fastas):
    """Write a tab-separated k-mer table with one row per (record, k-mer).

    Each row holds the file path, the record description, the sequence
    length, the GC content, the k-mer and its count. When ``args.star`` is
    set, every third position of a k-mer (indices 2, 5, ...) is replaced by
    "*" before counting. A header line is written first unless
    ``args.append`` is set.

    Relies on the module-level ``args``, ``output_handle`` and ``logger``.

    Args:
        all_fastas: Iterable of FASTA file paths.
    """
    global args, output_handle
    kmer_length = args.kmer_length
    # Positions masked in "star" mode.
    star_positions = range(2, kmer_length, 3)

    # write() works on both Python 2 and 3, unlike the "print >>" statement.
    if not args.append:
        output_handle.write("\t".join([
            "path", "sequence_description", "sequence_length", "GC", "kmer",
            "count"
        ]) + "\n")
    for f in all_fastas:
        logger.debug("Processing file %s", f)
        for record in SeqIO.parse(f, "fasta", generic_dna):
            kmer_count = collections.defaultdict(int)

            logger.debug("Processing sequence %s", record.description)

            seq = str(record.seq)
            fasta_keys = [f, record.description, str(len(seq)),
                          str(SeqUtils.GC(record.seq))]

            # "+ 1" so the k-mer ending on the last base is counted as well
            # (the previous bound silently dropped it).
            for start in range(len(seq) - kmer_length + 1):
                kmer = seq[start:start + kmer_length]
                if args.star:
                    masked = list(kmer)
                    # Separate index name: the old code reused the outer "i".
                    for position in star_positions:
                        masked[position] = "*"
                    kmer = "".join(masked)

                kmer_count[kmer] += 1
            for kmer, count in kmer_count.items():
                output_handle.write(
                    "\t".join(fasta_keys + [kmer, str(count)]) + "\n")
コード例 #9
0
 def __init__(self, file, fastaRecord):
     """Collect summary statistics of a single FASTA record.

     Stores the originating ``file``, the record description, and the
     length, GC content and CRC32 checksum of the record sequence.
     """
     super(SequenceStat, self).__init__()
     sequence = fastaRecord.seq
     self.file = file
     self.description = fastaRecord.description
     self.length = len(sequence)
     self.gc = SeqUtils.GC(sequence)
     self.crc32 = CheckSum.crc32(sequence)
コード例 #10
0
def main():
    """Flag contigs whose GC content deviates too far from the mean.

    Reads the contigs from ``args["fna"]``, computes each contig's GC content
    (rounded to two decimals) and its absolute deviation from the mean, and
    writes the ids of contigs whose deviation exceeds ``args["cutoff"]`` to
    ``<tmp_dir>/flagged_contigs`` (one id per line).
    """
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)

    print("\n## Computing mean contig GC content")
    contigs = {}
    for contig_id, sequence in utility.parse_fasta(args["fna"]):
        record = Contig()
        record.id = contig_id
        record.seq = str(sequence)
        record.gc = round(SeqUtils.GC(sequence), 2)
        contigs[contig_id] = record
    mean_gc = np.mean([record.gc for record in contigs.values()])

    print("\n## Computing per-contig deviation from mean")
    for record in contigs.values():
        record.values = {"delta": abs(record.gc - mean_gc)}

    print("\n## Identifying outlier contigs")
    flagged = [
        record.id for record in contigs.values()
        if record.values["delta"] > args["cutoff"]
    ]

    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"   {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as handle:
        handle.writelines(contig_id + "\n" for contig_id in flagged)
コード例 #11
0
def gene_feature(Y, X, learn_options):
    '''
    Build gene-level features (length, GC content, melting temperature and
    molecular weight of the gene sequence) for every row of ``Y``.

    ``X`` and ``learn_options`` are accepted but not used here.
    Returns a DataFrame indexed like ``Y['Target gene']`` with one column
    per feature.
    '''
    gene_names = Y['Target gene']

    # Columns: length, GC content, temperature, molecular weight.
    features = np.zeros((gene_names.shape[0], 4))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # All rows that target this gene receive the same feature values.
        rows = gene_names.values == gene
        features[rows] = (
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_NN(seq, rna=False),
            SeqUtil.molecular_weight(seq, 'DNA'),
        )

    return pandas.DataFrame(data=features,
                            index=gene_names.index,
                            columns=[
                                'gene length', 'gene GC content',
                                'gene temperature', 'gene molecular weight'
                            ])
コード例 #12
0
ファイル: summary.py プロジェクト: emmaver/IBP19-20
def _save_distribution(values, color, xlabel, title, path):
    """Draw a distribution plot of ``values`` and save it to ``path``."""
    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.3)
    axes = sns.distplot(values, hist=True, rug=False, color=color)
    axes.set(xlabel=xlabel)
    axes.set_title(title)
    sns.despine()
    axes.get_figure().savefig(path)
    plt.clf()


def plot(unmappeddict, unmap_stats, out):
    """ Generates boxplot, distribution plots and join plots from the missing regions summary statistics.
        They are saved in the output directory as jpg images.

    Parameters
    ----------
    unmappeddict: dict
        Dictionary of the coordinates and sequences of the unmapped regions
    unmap_stats: dataframe
        Table containing the unmapped regions summary statistics
    out: str
        Output directory

    """
    # Only the sequences (the dictionary values) are needed here.
    sequences = list(unmappeddict.values())
    gc_content = [SeqUtils.GC(sequence) for sequence in sequences]
    regions_length = [len(sequence) for sequence in sequences]

    # Joint plot of region length against GC content.
    plt.figure(figsize=(10, 10))
    sns.set(style='white', font_scale=2)
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # the data vectors positionally.
    fig_joint = sns.jointplot(x=regions_length,
                              y=gc_content,
                              kind='hex',
                              height=7)
    fig_joint.set_axis_labels(xlabel='Length', ylabel='GC Content')
    fig_joint.savefig(os.path.join(out, 'gc_length_joint_missing.jpg'))
    plt.clf()

    _save_distribution(gc_content, 'red', 'GC Content',
                       'Distribution of GC Content',
                       os.path.join(out, 'gc_content_missing.jpg'))
    _save_distribution(regions_length, 'green', 'Length',
                       'Distribution of Length',
                       os.path.join(out, 'length_missing.jpg'))

    # Box plot over columns 3..23 of the summary table.
    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.2)
    ax = sns.boxplot(data=unmap_stats.iloc[:, 3:24], palette='Spectral')
    ax.set_xlabel('Translated Codons')
    ax.set_ylabel('Mean Percentage per Frame (%)')
    sns.despine()
    ax.get_figure().savefig(os.path.join(out, 'codons_missing.jpg'))
    plt.clf()
コード例 #13
0
def get_Seq_ORF_features(file_path,input_file,model):
    """Collect sequence and ORF features for every record of a FASTA file.

    Per-record features come from ``SeqUtils.GC`` and the project helpers
    ``PP.param``, ``leng.len_cov`` and ``CTD``. The ``ORF``, ``fickett`` and
    ``hexamer`` values are read from the output of the bundled ``cpat.py``
    script, which is run once on the whole input file.

    Args:
        file_path: Directory that contains ``feamodule/cpat.py``.
        input_file: Path of the FASTA file to analyse.
        model: Key into the module-level ``model_reference``; element ``[1]``
            of the entry is passed to cpat via ``-x``.

    Returns:
        Tuple ``(features_dict, seq_id, transcript_sequences)``:
        ``features_dict`` maps the lower-cased record id to its features,
        ``seq_id`` lists those ids and ``transcript_sequences`` the record
        sequences, both in file order.
    """
    import subprocess  # local import: only needed to launch cpat

    seq_id = []
    features_dict = {}
    transcript_sequences = []
    for record in SeqIO.parse(input_file, "fasta"):
        # Ids are lower-cased so they match the cpat output lookup below.
        name = record.id.lower()
        seq_id.append(name)
        seq = record.seq
        transcript_sequences.append(seq)
        features_dict[name] = {}
        features_dict[name]["length"] = len(record.seq)
        features_dict[name]["G+C"] = SeqUtils.GC(record.seq)
        insta_fe,PI_fe,gra_fe = PP.param(seq)
        Len,Cov,inte_fe = leng.len_cov(seq)
        features_dict[name].update({"ORF-integrity":inte_fe,"ORF-coverage":Cov,"Instability":insta_fe,"PI":PI_fe,"Gravy":gra_fe})
        A,T,G,C,AT,AG,AC,TG,TC,GC,A0,A1,A2,A3,A4,T0,T1,T2,T3,T4,G0,G1,G2,G3,G4,C0,C1,C2,C3,C4 = CTD(seq)
        features_dict[name].update({'A':A,'T':T,'G':G,'C':C,'AT':AT,'AG':AG,'AC':AC,'TG':TG,'TC':TC,'GC':GC,'A0':A0,'A1':A1,'A2':A2,'A3':A3,'A4':A4,'T0':T0,'T1':T1,'T2':T2,'T3':T3,'T4':T4,'G0':G0,'G1':G1,'G2':G2,'G3':G3,'G4':G4,'C0':C0,'C1':C1,'C2':C2,'C3':C3,'C4':C4})

    # Use cpat to get fickett, hexamer and ORF. The command is passed as an
    # argument list, so paths with spaces or shell metacharacters are
    # neither split nor interpreted by a shell. As before, a non-zero exit
    # status is not treated as an error here.
    subprocess.call([
        "python3", file_path + "/feamodule/cpat.py",
        "-g", input_file,
        "-o", "temp_cpat.txt",
        "-x", model_reference[model][1],
    ])
    with open("temp_cpat.txt.dat", "r") as tabular:
        cpat_reader = csv.reader(tabular, delimiter=("\t"))
        for row in cpat_reader:
            name = row[0].lower()
            features_dict[name]["ORF"] = float(row[2])
            features_dict[name]["fickett"] = float(row[3])
            features_dict[name]["hexamer"] = float(row[4])
    # Remove the temporary cpat output without spawning a shell.
    os.remove("temp_cpat.txt.dat")
    return features_dict,seq_id,transcript_sequences
コード例 #14
0
ファイル: summary.py プロジェクト: 0mician/SASpector
def refstats(reference, mappedlocations, unmappedlocations, conflictlocations, reverselocations, unmappeddict):
    """Generates summary statistics for reference genome based on the mapped, unmapped and conflict regions

    Parameters
    ----------
    reference: str
        The file location of the reference FASTA file
    mappedlocations: dataframe
        Coordinates of the mapped regions in the reference sequence
    unmappedlocations: dataframe
        Coordinates of the unmapped regions in the reference sequence
    conflictlocations: dataframe
        Coordinates of the conflict regions in the reference sequence
    reverselocations: dataframe
        Coordinates of the mapped reverse complement regions in the reference sequence
    unmappeddict: dict
        Dictionary of the coordinates and sequences of the unmapped regions

    Returns
    -------
    refstats_t: dataframe
        Single-row table containing the reference summary statistics

    """
    def covered_length(locations):
        # Sum of |second column - first column| over all rows.
        return sum(abs(locations.iloc[row, 1] - locations.iloc[row, 0])
                   for row in range(locations.shape[0]))

    # Lengths covered by each category of region
    mapped_length = covered_length(mappedlocations)
    conflict_length = covered_length(conflictlocations)
    reverse_length = covered_length(reverselocations)
    total_map = mapped_length + conflict_length + reverse_length
    sum_unmap = covered_length(unmappedlocations)

    read = SeqIO.read(reference, format = 'fasta')
    genome_length = len(str(read.seq))
    region_count = (mappedlocations.shape[0] + reverselocations.shape[0]
                    + conflictlocations.shape[0])

    summary_rows = [{
        'GCContent': SeqUtils.GC(read.seq),
        'Length': genome_length,
        'NumberMappedRegions': region_count,
        'NumberUnmappedRegions': unmappedlocations.shape[0],
        'FilteredUnmappedRegions': len(unmappeddict),
        # Fractions are expressed as percentages of the reference length.
        'FractionMapped': (total_map/genome_length)*100,
        'FractionUnmapped': (sum_unmap/genome_length)*100,
    }]

    # Create reference summary dataframe
    refstats_t = pd.DataFrame.from_dict(summary_rows)
    refstats_t.reset_index(drop = True, inplace = True)
    refstats_t.sort_index(inplace = True)

    return refstats_t
コード例 #15
0
def get_stats_from_contigs(contigs_fasta):
    """
    Parse a contigs FASTA file with BioPython and return a DataFrame,
    indexed by contig name, with the ``length`` and ``GC`` of each contig.

    The contig name is the first whitespace-delimited token of the title.
    """
    # SimpleFastaParser yields plain (title, sequence) pairs; the original
    # author chose it for speed over SeqIO.parse.
    with open(contigs_fasta, 'r') as CF:
        rows = [
            (title.split(None, 1)[0], len(sequence), SeqUtils.GC(sequence))
            for title, sequence in SeqIO.FastaIO.SimpleFastaParser(CF)
        ]

    # Split the rows back into one list per column.
    names = [row[0] for row in rows]
    sizes = [row[1] for row in rows]
    gc_values = [row[2] for row in rows]

    table = pandas.DataFrame({'contig': names,
                              'length': sizes,
                              'GC': gc_values})
    return table.set_index('contig')
コード例 #16
0
def candidates_for_seq(seq, descriptor, GC_requirement=(0, 100)):
    """Collect the spacer candidates that follow each PAM site in ``seq``.

    For every occurrence of the module-level ``PAM_SEQ`` the
    ``SPACER_LENGTH`` bases directly after it form a candidate, provided
    its GC content lies within ``GC_requirement``. The search stops at the
    first PAM whose spacer would run past the end of the sequence.

    Args:
        seq: Sequence to scan (must support ``find`` with a start offset
            and slicing).
        descriptor: Prefix of the candidate name; the spacer start position
            is appended to it.
        GC_requirement: ``(minimum, maximum)`` GC content, both inclusive.
            An immutable default replaces the former shared list.

    Returns:
        List of dicts with the keys ``name``, ``seqrec`` (a ``SeqRecord``)
        and ``location`` (start index of the spacer in ``seq``).
    """
    candidates = []
    search_from = 0
    while search_from < len(seq):
        # Search from an offset instead of re-slicing the remaining
        # sequence on every iteration.
        pam_index = seq.find(PAM_SEQ, search_from)
        if pam_index == -1:
            break
        target_start = pam_index + len(PAM_SEQ)
        target_end = target_start + SPACER_LENGTH
        if target_end > len(seq):
            # No room left for a complete spacer.
            break

        # Continue one base after this PAM so overlapping PAMs are found.
        search_from = pam_index + 1

        targetSeq = seq[target_start:target_end]
        GC_content = SeqUtils.GC(targetSeq)
        if GC_content < GC_requirement[0] or GC_content > GC_requirement[1]:
            continue

        name = descriptor + str(target_start)
        target = SeqRecord(targetSeq, id=name, name=name, description=name)
        candidates.append({
            'name': target.id,
            'seqrec': target,
            'location': target_start
        })
    return candidates
コード例 #17
0
def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.01):
    """Filter FASTA records by GC content and aromaticity.

    Every record is translated; records whose nucleotide GC content is at
    least ``filt_gc`` and whose protein aromaticity is at least
    ``filt_arom`` are written, as amino-acid sequences, to ``output_file``.
    The fraction of records kept is printed.

    Args:
        input_file: Path of the nucleotide FASTA file to read.
        output_file: Path of the protein FASTA file to write. (Previously
            ignored in favour of the hard-coded name 'my_fasta'.)
        filt_gc: Minimum GC content, on the scale of ``SeqUtils.GC``.
        filt_arom: Minimum aromaticity of the translated sequence.
    """
    # record id -> translated sequence; a repeated id keeps the last record
    proteins = {}
    total = 0

    with open(input_file, "r") as content:
        for record in SeqIO.parse(content, "fasta"):
            total += 1

            # GC content of the nucleotide sequence
            calc_gc = SeqUtils.GC(record.seq)

            # aromaticity of the translated sequence
            prot_seq = record.seq.translate()
            calc_arom = ProteinAnalysis(str(prot_seq)).aromaticity()

            if calc_gc >= filt_gc and calc_arom >= filt_arom:
                # The old code read the non-existent attribute "record.se".
                proteins[record.id] = prot_seq

    # write a new fasta file with aminoacids
    records = [
        SeqRecord(prot_seq, id=seq_id, description="")
        for seq_id, prot_seq in proteins.items()
    ]
    with open(output_file, 'w') as write_file:
        SeqIO.write(records, write_file, 'fasta')

    # print the fraction of records kept (0.0 for an empty input instead of
    # a ZeroDivisionError)
    print(len(records) / total if total else 0.0)
コード例 #18
0
ファイル: lines_GO.py プロジェクト: linocesar/projeto-lines
def createDataFromRepeatMasker(diretorio):
    """Extract long LINE/L1 elements from the RepeatMasker files of a directory.

    Every file named ``Repeat_Encode*`` in ``diretorio`` is read line by
    line. For each LINE/L1 line whose taxon is listed in the module-level
    ``taxons``, whose span is at least 5000 positions and whose
    taxon/encode key is present in ``taxon_encodeID_db``, the element
    sequence is written out as FASTA, its ``calculate.GC`` value is stored
    in ``taxonStat`` and the taxon/encode frequency is updated.

    Args:
        diretorio: Directory holding the RepeatMasker output files.
    """
    path = os.path.abspath(diretorio)

    for arquivo_encode in os.listdir(path):
        # Only RepeatMasker output files are processed.
        if not padrao.fnmatch(arquivo_encode, "Repeat_Encode*"):
            continue

        # The with-block closes the file even when a helper raises.
        with open(os.path.join(path, arquivo_encode), 'r') as arquivo:
            for linha in arquivo:
                # Skip lines without a LINE/L1 annotation.
                if not tem_LINE_L1(linha):
                    continue

                # Normalise the line and split it into its columns.
                it = formata_RepeatMasker(linha).split('\t')
                taxon = it[4]  # fifth column: taxon (Aotus, Pan, ...)
                if taxon not in taxons:
                    continue

                encodeID = get_encodeIDFromRepeatMasker(arquivo_encode)
                posicao_inicial = int(it[5])  # start of the LINE sequence
                posicao_final = int(it[6])  # end of the LINE sequence

                # Size filter on the element span.
                if posicao_final - posicao_inicial < 5000:
                    continue

                # Lookup key for taxon_encodeID_db.
                taxon_encodeID = get_taxon_encodeID(taxon, encodeID)
                if taxon_encodeID not in taxon_encodeID_db:
                    continue

                sequencia = get_Seq_From_TaxonEncodeIDdb(taxon_encodeID)
                sequencia_LINE = get_seq_LINE(sequencia, posicao_inicial,
                                              posicao_final)
                cabecalho_LINE = format_cabecalho_LINE(
                    taxon, encodeID, posicao_inicial, posicao_final)
                alvo_fasta_LINE = format_fasta_LINE(cabecalho_LINE,
                                                    sequencia_LINE)

                escreverArquivoFasta(cabecalho_LINE, alvo_fasta_LINE)
                taxonStat[cabecalho_LINE] = calculate.GC(alvo_fasta_LINE)
                updateFreqTaxonEncode(taxon, encodeID)
コード例 #19
0
 def checkgc(self, seq):
     """Check the GC content of ``seq`` against every "gc" item.

     Items whose first element matches "gc" (case-insensitive) are
     evaluated with ``self._compare``; evaluation stops at the first falsy
     result. Returns True when there is no "gc" item, otherwise the result
     of the last comparison made.
     """
     outcome = True
     for entry in self.items:
         if not re.match(r'^gc$', entry[0], re.I):
             continue
         outcome = self._compare(SeqUtils.GC(seq.seq), entry[1], entry[2])
         if not outcome:
             return outcome
     return outcome
コード例 #20
0
    def calculate_gc(self):
        """Return the GC percentage of the stored sequence.

        :return: Float number - GC percent.
        :raises TypeError: when the sequence type is 'PROTEIN'.
        """
        if self.s_type != 'PROTEIN':
            return SeqUtils.GC(self.seq)
        raise TypeError('GC are not in {} sequence'.format(self.s_type))
コード例 #21
0
ファイル: sparna.py プロジェクト: bede/sparNA
def gc_content(asms_paths):
    """Return the GC fraction of every record of every assembly.

    Args:
        asms_paths: Mapping of assembly name to FASTA path.

    Returns:
        dict mapping each assembly name to the list of its records' GC
        values divided by 100, in file order.
    """
    return {
        asm: [SeqUtils.GC(record.seq) / 100
              for record in SeqIO.parse(path, 'fasta')]
        for asm, path in asms_paths.items()
    }
コード例 #22
0
def genome_seq(genome_path):
    """Read a genome FASTA file and return its sequence as a string.

    Prints the GC content and the length of the returned sequence. When
    the file holds several records, only the last one is returned.

    Args:
        genome_path: Path of the FASTA file.

    Returns:
        The sequence of the (last) record as ``str``.

    Raises:
        ValueError: if the file contains no FASTA record.
    """
    genome_sequence = None
    # The with-block closes the file even when parsing fails.
    with open(genome_path, 'r') as genome:
        for record in SeqIO.parse(genome, "fasta"):
            genome_sequence = str(record.seq)

    if genome_sequence is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError('No FASTA record found in ' + str(genome_path))

    print('Whole genome average GC: ' + str(SeqUtils.GC(genome_sequence)))
    print('Whole genome length: ' + str(len(genome_sequence)))
    return genome_sequence
コード例 #23
0
 def gccontent(self):
     """@return float, GC content of sequence in %, rounded to whole numbers

     Best effort: when the GC content cannot be computed, 0.0 is returned.
     """
     r = 0.0
     try:
         r = SeqUtils.GC(self.sequence)
     except Exception:
         # Still best effort, but no longer swallows KeyboardInterrupt or
         # SystemExit as the former bare "except:" did.
         pass
     return round(r, 0)
コード例 #24
0
ファイル: lib_primer3.py プロジェクト: milw/polyoligo
 def calc_thermals(self):
     """Compute thermodynamic and composition properties of the sequence.

     Does nothing when ``self.has_thermals`` is already truthy. Otherwise
     sets ``tm``, ``hairpin_tm``, ``homodimer_tm``, ``length``,
     ``gc_content`` and ``nN`` (number of "N" characters).
     """
     if self.has_thermals:
         return

     analysis = ThermoAnalysis()  # primer3 thermodynamic analysis object
     sequence = self.sequence
     self.tm = analysis.calcTm(sequence)
     self.hairpin_tm = analysis.calcHairpin(sequence).tm
     self.homodimer_tm = analysis.calcHomodimer(sequence).tm
     self.length = len(sequence)
     self.gc_content = SeqUtils.GC(sequence)
     self.nN = sequence.count("N")
コード例 #25
0
def compute_stats(seq):
    """Return length, GC content and molecular weight of ``seq``.

    Args:
        seq: Sequence accepted by ``SeqUtils.GC`` and
            ``SeqUtils.molecular_weight``.

    Returns:
        A ``SeqStats`` object with the attributes ``length``, ``gc`` and
        ``weight``; ``weight`` is None when the molecular weight cannot be
        computed (``ValueError``).
    """
    # Create a fresh object per call. The old code assigned the SeqStats
    # class itself, so every call overwrote the same shared attributes.
    # NOTE(review): assumes SeqStats can be constructed without arguments.
    stats = SeqStats()
    stats.length = len(seq)
    stats.gc = SeqUtils.GC(seq)
    try:
        stats.weight = SeqUtils.molecular_weight(seq)
    except ValueError:
        stats.weight = None
    return stats
コード例 #26
0
ファイル: util.py プロジェクト: ElucidataInc/crispor
def target_genes_stats(
    genes=(
        'HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', 'MED12', 'CCDC101'
    )):
    """Print length, GC content, melting temperature and molecular weight
    of each gene sequence.

    Genes for which ``get_gene_sequence`` returns None are skipped.

    Args:
        genes: Iterable of gene names (immutable default).
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        if seq is not None:
            # Single parenthesised argument: valid as a Python 2 print
            # statement and as a Python 3 function call.
            print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (
                gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(
                    seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA')))
コード例 #27
0
def get_gc():
    """Yield windowed GC content for every contig document in ``db['genes']``.

    Window centres start at position 500 and advance in steps of 1000; each
    window covers ``seq[centre - 500:centre + 499]``. Contigs that yield no
    window are skipped.

    Yields:
        Tuples ``(species name, contig id, gc_points)`` where ``gc_points``
        is a list of ``(centre - 500, centre + 499, GC content)`` tuples.
    """
    for contig in db['genes'].find({'type': 'contig'}):
        seq = contig['dna_seq']
        gc_points = [
            (centre - 500, centre + 499,
             SeqUtils.GC(seq[centre - 500:centre + 499]))
            for centre in range(500, len(seq), 1000)
        ]
        if not gc_points:
            continue
        yield safe_species_name(
            contig['species']), contig['contig_id'], gc_points
コード例 #28
0
ファイル: sparna.py プロジェクト: bede/sparNA
def gc_contents(asms_paths):
    '''
    Accepts a mapping of assembly name to multifasta path, returns a dict
    mapping each assembly name to the list of its records' GC fractions
    (``SeqUtils.GC`` divided by 100).
    '''
    contents_by_asm = {}
    for asm_name, asm_path in asms_paths.items():
        # Iterate the parsed records and file them under the assembly name.
        # The old code looped over the ``seqrecords`` function itself and
        # indexed the result by record id, which fails for any input.
        contents_by_asm[asm_name] = [
            SeqUtils.GC(record.seq) / 100
            for record in seqrecords(asm_path)
        ]
    return contents_by_asm
コード例 #29
0
def get_gc_and_len_dict(fastafile):
    """Map every FASTA record id to a dictionary holding its sequence
    length ("length") and GC content ("GC")."""
    return {
        rec.id: {"length": len(rec.seq), "GC": SeqUtils.GC(rec.seq)}
        for rec in SeqIO.parse(fastafile, "fasta")
    }
コード例 #30
0
ファイル: gbk2table.py プロジェクト: thisisliuqing/TPutils
    def get_one_record_features(self, one_record):
        """Convert the features of one record into ``Feature`` objects.

        ``misc_feature`` entries are skipped. Position, length, strand, the
        extracted sequence and its GC content are always set; attributes
        that come from qualifiers are only set when the qualifier exists.

        Args:
            one_record: Record providing ``features`` and ``name``.

        Returns:
            List of ``Feature`` objects in feature order.
        """
        # (Feature attribute, qualifier key) pairs whose first value is
        # copied when the qualifier is present.
        first_value_qualifiers = (
            ("gene", "gene"),
            ("locus", "locus_tag"),
            ("protein_id", "protein_id"),
            ("product", "product"),
            ("translation", "translation"),
        )

        feature_list = []
        for feature in one_record.features:
            if feature.type == "misc_feature":
                continue

            new_feature = Feature()
            new_feature.type = feature.type
            new_feature.contig = one_record.name
            new_feature.start = feature.location.start
            new_feature.stop = feature.location.end
            new_feature.length = len(feature.location)
            new_feature.strand = feature.strand

            qualifiers = feature.qualifiers
            for attribute, key in first_value_qualifiers:
                try:
                    setattr(new_feature, attribute, qualifiers[key][0])
                except (KeyError, IndexError):
                    # Qualifier absent or empty: leave the attribute unset.
                    pass

            try:
                # Unlike the others, the whole value list is stored.
                new_feature.inference = qualifiers['inference']
            except KeyError:
                pass

            try:
                db_xref = qualifiers['db_xref']
                # NOTE(review): the pattern is "GO*" although the result is
                # stored as "gi" - confirm which cross-reference is meant.
                gi_position = find_index("GO*", db_xref)
                new_feature.gi = db_xref[gi_position][3:]
            except Exception:
                # find_index is not visible here, so its failure modes are
                # unknown; stay best effort but let KeyboardInterrupt and
                # SystemExit through.
                pass

            new_feature.seq = feature.extract(self.seq)
            new_feature.GC = SeqUtils.GC(new_feature.seq)
            feature_list.append(new_feature)

        return feature_list