def random_human_genes(num):
    """Return ``num`` distinct gene names drawn at random from the proteome.

    Every record of the hard-coded UniProt FASTA file is read, the gene name
    parsed from each record header is collected into a set, and ``num`` of
    those names are sampled without replacement.

    Args:
        num: How many gene names to draw.

    Returns:
        A list of ``num`` gene names.

    Raises:
        ValueError: From ``random.sample`` when ``num`` is negative or larger
            than the number of distinct gene names found.
    """
    # Raw string: identical characters to the previous literal, but the
    # backslashes can no longer be interpreted as escape sequences (\U, \u).
    # NOTE(review): "H**o" looks like a mangled "Homo" -- confirm the file
    # name against the data directory.
    proteome_path = r"data\UniProt full Proteomes\uniprot-proteome-full H**o sapiens (71599 proteins).fasta"

    genes_set = set()
    # The context manager closes the FASTA handle once parsing is finished
    # (it used to be left open).
    with open(proteome_path) as proteome_handle:
        for rec in SeqIO.parse(proteome_handle, 'fasta'):
            _unique_id, _entry_name, _protein_name, _organism, gene_name = \
                utils.parse_UniProtKB_header(rec.description)
            genes_set.add(gene_name)

    # random.sample is documented for sequences; newer interpreters reject a
    # set, so hand it a list of the distinct names.
    return random.sample(list(genes_set), num)
def get_human_proteins_that_contain_saar(SAAR):
    """Return the gene names whose protein sequence contains ``SAAR``.

    Every record of the hard-coded UniProt FASTA file is read; a record's
    gene name is kept when ``SAAR`` occurs as a substring of its sequence.

    Args:
        SAAR: The string to look for inside each protein sequence.

    Returns:
        A set with the gene names of all matching records.
    """
    # Raw string: identical characters to the previous literal, but the
    # backslashes can no longer be interpreted as escape sequences (\U, \u).
    # NOTE(review): "H**o" looks like a mangled "Homo" -- confirm the file
    # name against the data directory.
    proteome_path = r"data\UniProt full Proteomes\uniprot-proteome-full H**o sapiens (71599 proteins).fasta"

    rcp_set = set()
    # The context manager closes the FASTA handle once parsing is finished
    # (it used to be left open).
    with open(proteome_path) as proteome_handle:
        for rec in SeqIO.parse(proteome_handle, 'fasta'):
            # The header is parsed for every record, matching or not, so a
            # malformed header still surfaces exactly as it did before.
            _unique_id, _entry_name, _protein_name, _organism, gene_name = \
                utils.parse_UniProtKB_header(rec.description)
            if SAAR in str(rec.seq):
                rcp_set.add(gene_name)
    return rcp_set
            kmer for kmer in self.repeating_non_overlapping_kmers
            if self.seq.find(kmer) +
            params.MIN_DIST_BETWEEN_REPETITIONS < self.seq.rfind(kmer)
        ]


# Build one Protein object per distinct gene found in params.HUMAN_PROTEOME.
# Records without a gene name, and records whose gene name was already seen,
# are reported and skipped.
#
# print is used in its single-argument parenthesised form throughout; it
# produces the same output as the bare print statement did.
all_proteins = []  # all_proteins[i] = PROTEIN()_OBJECT
protein_seq = {}  # protein_seq[GENE_NAME] = AMINO_ACID_SEQUENCE

print("Reading Uniprot file and generating k-mers list for each protein...")
created_protein_names = set()  # prevent creation of two similar protein objects
# The context manager closes the proteome file once every record has been
# read (the handle used to be left open).
with open(params.HUMAN_PROTEOME) as proteome_handle:
    for rec in SeqIO.parse(proteome_handle, 'fasta'):
        seq = str(rec.seq)
        uniqueIdentifier, entryName, proteinName, organismName, geneName = \
            utils.parse_UniProtKB_header(rec.description)
        if geneName == '':
            print('Ignoring unknown gene: %s' % rec.description)
            continue
        if geneName in created_protein_names:
            print('Ignoring duplicate gene: %s' % rec.description)
            continue
        # create a new Protein object
        created_protein_names.add(geneName)
        protein_seq[geneName] = seq
        all_proteins.append(Protein(geneName, seq, params.K))

print("")  # blank separator line
print("Counted k-mer (k=%d) for %d different genes (proteins)." % (
    params.K, len(all_proteins)))
# Example #4
# 0
def main(proteome_file, similar_diluted):
    """Write a CSV listing the k-mers shared by the most proteins.

    The function works in three stages:

    1. Read ``proteome_file`` (FASTA) and build one ``Protein`` per distinct
       gene name. When a header carries no gene name the UniProt identifier
       is used instead; later records with an already-seen name are counted
       and skipped.
    2. Map every k-mer to the set of gene names whose protein contains it.
    3. Write the k-mers, most widely shared first, to a timestamped CSV file
       in the directory of ``proteome_file``, stopping at the first k-mer
       found in fewer than 5 proteins.

    Args:
        proteome_file: Path of the FASTA proteome to analyse.
        similar_diluted: When truthy, a protein is not recorded for a k-mer
            if ``utils.proteins_are_dissimilar`` reports that it is not
            dissimilar to a protein already recorded for that k-mer.

    Returns:
        None. Progress is printed and the result is written to disk.
    """
    import csv
    import datetime

    all_proteins = []  # all_proteins[i] = PROTEIN()_OBJECT
    protein_seq = {}  # protein_seq[GENE_NAME] = AMINO_ACID_SEQUENCE

    print("Reading Uniprot file and generating k-mers list for each protein...")
    duplicate_genes_ignored = 0
    # The context manager closes the proteome file once every record has
    # been read (the handle used to be left open).
    with open(proteome_file) as proteome_handle:
        for rec in SeqIO.parse(proteome_handle, 'fasta'):
            seq = str(rec.seq)
            uniqueIdentifier, entryName, proteinName, organismName, geneName = \
                utils.parse_UniProtKB_header(rec.description)
            if geneName == '':
                # uncharacterized gene: fall back to the UniProt identifier
                geneName = uniqueIdentifier
            # protein_seq gains a key for every Protein created, so it also
            # serves as the "already created" lookup.
            if geneName in protein_seq:
                duplicate_genes_ignored += 1
                continue
            protein_seq[geneName] = seq
            all_proteins.append(Protein(geneName, seq, params.K))

    print("")  # blank separator line
    print("Ignored %d duplicate genes." % duplicate_genes_ignored)
    print("Counted k-mer (k=%d) for %d different genes (proteins)." % (
        params.K, len(all_proteins)))

    pb = Progressbar('Generating frequency dictionary for k-mers')
    # kmers_frequency[KMER] = set of gene names recorded for that k-mer
    kmers_frequency = {}
    protein_count = len(all_proteins)
    for position, prot in enumerate(all_proteins, 1):
        pb.update_progress(position, protein_count)

        for kmer in prot.kmers:
            protein_names = kmers_frequency.setdefault(kmer, set())

            # Add the new protein only if it is dissimilar enough from all
            # proteins already recorded for this k-mer. any() stops at the
            # first similar protein, like the explicit loop-and-break did.
            redundant = similar_diluted and any(
                not utils.proteins_are_dissimilar(
                    protein_name, prot.geneName,
                    protein_seq[protein_name], prot.seq)
                for protein_name in protein_names)
            if not redundant:
                protein_names.add(prot.geneName)

    print("Sorting frequent k-mers by frequency...")
    most_frequent_kmers = sorted(kmers_frequency,
                                 key=lambda k: len(kmers_frequency[k]),
                                 reverse=True)

    print("Writing results to file...")
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
    filename_without_extension = os.path.splitext(
        os.path.basename(proteome_file))[0]
    dilution_status = 'with dilution' if similar_diluted else 'without dilution'
    outfile = "{0} - frequent k{1}-mers - {2} - {3}.csv".format(
        filename_without_extension, params.K, dilution_status, timestamp)
    outfile = os.path.join(os.path.dirname(proteome_file), outfile)
    # Binary mode is what the Python 2 csv module expects for its file object.
    with open(outfile, "wb") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow([
            'k-mer', 'number of proteins', 'percentage', 'all',
            '(Out of %d proteins in total)' % protein_count
        ])

        for kmer in most_frequent_kmers:
            total_proteins = len(kmers_frequency[kmer])
            if total_proteins < 5:
                # sorted by decreasing count, so nothing later qualifies
                break
            percentage = round(float(total_proteins) / protein_count, 6)
            geneList = list(kmers_frequency[kmer])
            writer.writerow([kmer, total_proteins, percentage, geneList])
def main(proteome_file, output_dir):
    """Write a CSV listing the k-mers shared by the most proteins.

    The function works in three stages:

    1. Read ``proteome_file`` (FASTA) and build one ``Protein`` per distinct
       gene name. When a header carries no gene name the UniProt identifier
       is used instead; later records with an already-seen name are reported
       and skipped.
    2. Map every k-mer to the set of gene names whose protein contains it.
    3. Write the k-mers, most widely shared first, to a timestamped CSV file
       inside ``output_dir``, stopping at the first k-mer found in fewer
       than 10 proteins.

    Args:
        proteome_file: Path of the FASTA proteome to analyse.
        output_dir: Directory in which the CSV file is created.

    Returns:
        None. Progress is printed and the result is written to disk.
    """
    import csv
    import datetime

    all_proteins = []  # all_proteins[i] = PROTEIN()_OBJECT
    protein_seq = {}  # protein_seq[GENE_NAME] = AMINO_ACID_SEQUENCE

    print("Reading Uniprot file and generating k-mers list for each protein...")
    # The context manager closes the proteome file once every record has
    # been read (the handle used to be left open).
    with open(proteome_file) as proteome_handle:
        for rec in SeqIO.parse(proteome_handle, 'fasta'):
            seq = str(rec.seq)
            uniqueIdentifier, entryName, proteinName, organismName, geneName = \
                utils.parse_UniProtKB_header(rec.description)
            if geneName == '':
                # uncharacterized gene: fall back to the UniProt identifier
                geneName = uniqueIdentifier
            # protein_seq gains a key for every Protein created, so it also
            # serves as the "already created" lookup.
            if geneName in protein_seq:
                print('Ignoring duplicate gene: %s' % rec.description)
                continue
            protein_seq[geneName] = seq
            all_proteins.append(Protein(geneName, seq, params.K))

    print("")  # blank separator line
    print("Counted k-mer (k=%d) for %d different genes (proteins)." % (
        params.K, len(all_proteins)))

    pb = Progressbar('Generating frequency dictionary for k-mers')
    # Stays 0 while the gene-family filter below is disabled; it is still
    # subtracted from the totals so re-enabling the filter keeps them right.
    skipped_prots = 0
    # kmers_frequency[KMER] = set of gene names whose protein has that k-mer
    kmers_frequency = {}
    protein_count = len(all_proteins)
    for position, prot in enumerate(all_proteins, 1):
        pb.update_progress(position, protein_count)
        # Disabled gene-family filter, kept for reference:
        # if prot.geneName.startswith('ZNF') or prot.geneName.startswith('ZF'):
        #     skipped_prots += 1
        #     continue
        # if prot.geneName.startswith('OR'):
        #     skipped_prots += 1
        #     continue
        # if prot.geneName.startswith('HOX'):
        #     skipped_prots += 1
        #     continue
        # if prot.geneName.startswith('IGKV'):
        #     skipped_prots += 1
        #     continue

        for kmer in prot.kmers:
            kmers_frequency.setdefault(kmer, set()).add(prot.geneName)

    print("Sorting frequent k-mers by frequency...")
    most_frequent_kmers = sorted(kmers_frequency,
                                 key=lambda k: len(kmers_frequency[k]),
                                 reverse=True)

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
    outfile = '{}/frequent k{}-mers - {}.csv'.format(output_dir, params.K,
                                                     timestamp)
    # Number of proteins that actually took part in the count.
    counted_proteins = protein_count - skipped_prots
    # Binary mode is what the Python 2 csv module expects for its file object.
    with open(outfile, "wb") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow([
            'k-mer', 'number of proteins', 'percentage of total', 'all',
            '(Out of %d proteins in total)' % counted_proteins
        ])

        for kmer in most_frequent_kmers:
            total_proteins = len(kmers_frequency[kmer])
            if total_proteins < 10:
                # sorted by decreasing count, so nothing later qualifies
                break
            percentage = "{0:.4f}".format(
                float(total_proteins) / counted_proteins)
            geneList = list(kmers_frequency[kmer])
            writer.writerow([kmer, total_proteins, percentage, geneList])