def print_acc2gi(accessions):
    """Print GIs corresponding to the given accession numbers.

    Builds one ``acc1[accn] OR acc2[accn] ...`` query, runs it through
    ``entrez.on_search`` (db ``nucleotide``, tool ``summary``) and, for every
    returned line that contains ``Name="Extra"`` and at least one of the
    requested accessions, prints ``accession -> gi``.

    Args:
        accessions: Iterable of accession strings. It may be a one-shot
            iterator; it is copied into a list before use.

    Returns:
        None. Output goes to stdout. Nothing is searched or printed when
        ``accessions`` is empty.
    """
    # The accessions are traversed once to build the query and again for
    # every result line, so a generator would be exhausted after the query.
    accessions = list(accessions)
    if not accessions:
        # An empty term is not a meaningful query; skip the request.
        return

    term = ' OR '.join(a + '[accn]' for a in accessions)
    for line in entrez.on_search(db='nucleotide', term=term, tool='summary'):
        if 'Name="Extra"' not in line:
            continue
        if not any(a in line for a in accessions):
            continue
        # Pipe-delimited fields: the second one is taken as the GI and the
        # fourth as the accession; whatever follows stays in the last field.
        fields = line.split('|', 4)
        if len(fields) < 5:
            # Too few fields to hold both identifiers: skip this line
            # instead of failing on the unpacking.
            continue
        gi, acc = fields[1], fields[3]
        print('%18s -> %s' % (acc, gi))
def main():
    """Print the run accessions found for an SRA identifier.

    Parses the command line (``-s/--sra SRAid``), runs the identifier through
    ``entrez.on_search`` (db ``sra``, tool ``summary``) and prints every
    ``acc="..."`` value found on lines that contain ``Name="Runs"``.

    The parser description is taken from the module docstring.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # Without an identifier the search term would be None, so require it and
    # let argparse report the usage error.
    parser.add_argument('-s', '--sra', metavar='SRAid', required=True,
                        help='SRA identifier')
    args = parser.parse_args()

    # Raw string: "\w" is not a valid string escape and triggers a warning
    # in a plain literal. Compiled once, outside the loop.
    run_acc = re.compile(r'acc="(?P<acc>\w+[0-9]+)"')

    for line in entrez.on_search(db='sra', term=args.sra, tool='summary'):
        if 'Name="Runs"' in line:
            # A single line may carry several acc="..." attributes; report
            # each of them. A line without any simply prints nothing.
            for match in run_acc.finditer(line):
                print(match.group('acc'))
def print_acc2gi(accessions):
    """Print GIs corresponding to the given accession numbers.

    Builds one ``acc1[accn] OR acc2[accn] ...`` query, runs it through
    ``entrez.on_search`` (db ``nucleotide``, tool ``summary``) and, for every
    returned line that contains ``Name="Extra"`` and at least one of the
    requested accessions, extracts the GI (``gi|<digits>|``) and the versioned
    accession (``emb|``, ``gb|``, ``ref|`` or ``dbj|`` followed by
    ``<name>.<version>|``) and prints ``accession -> gi``.

    Args:
        accessions: Iterable of accession strings. It may be a one-shot
            iterator; it is copied into a list before use.

    Returns:
        None. Output goes to stdout. Nothing is searched or printed when
        ``accessions`` is empty.
    """
    # The accessions are traversed once to build the query and again for
    # every result line, so a generator would be exhausted after the query.
    accessions = list(accessions)
    if not accessions:
        # An empty term is not a meaningful query; skip the request.
        return

    # Raw strings: "\|", "\w" and "\." are not valid string escapes and
    # trigger warnings in plain literals. Compiled once, outside the loop.
    gi_pattern = re.compile(r'gi\|([0-9]+)\|')
    acc_pattern = re.compile(
        r'((emb)|(gb)|(ref)|(dbj))\|(?P<acc>\w+\.[0-9]+)\|')

    term = ' OR '.join(a + '[accn]' for a in accessions)
    for line in entrez.on_search(db='nucleotide', term=term, tool='summary'):
        if 'Name="Extra"' in line and any(a in line for a in accessions):
            gi_match = gi_pattern.search(line)
            acc_match = acc_pattern.search(line)
            if gi_match is None or acc_match is None:
                # The line lacks a GI or a tagged accession: there is no
                # pair to print, so skip it rather than fail on None.
                continue
            print('%18s -> %s' % (acc_match.group('acc'), gi_match.group(1)))
def application_3():
    """Retrieving large datasets.

    Fetch every chimpanzee mRNA sequence in FASTA format (more than 50,000
    sequences) and store the result in ``chimp.fna`` in the current
    directory, one fetched line per output line.
    """
    search_term = 'chimpanzee[orgn] AND biomol mrna[prop]'
    with open('chimp.fna', 'w') as out_file:
        fasta_lines = entrez.on_search(db='nucleotide', term=search_term,
                                       tool='fetch', rettype='fasta')
        # Lazily terminate each fetched line with a newline and stream it
        # straight to the file, so the data set is never held in memory.
        out_file.writelines(fasta_line + '\n' for fasta_line in fasta_lines)
def application_3():
    """Sample Application 3: Retrieving large datasets

    Fetch every chimpanzee mRNA sequence in FASTA format (more than 50,000
    sequences), store the result in ``chimp.fna`` in the current directory,
    and then report the output file name on stdout.
    """
    search_term = 'chimpanzee[orgn] AND biomol mrna[prop]'
    with open('chimp.fna', 'w') as out_file:
        fasta_lines = ez.on_search(db='nucleotide', term=search_term,
                                   tool='fetch', rettype='fasta')
        # Lazily terminate each fetched line with a newline and stream it
        # straight to the file, so the data set is never held in memory.
        out_file.writelines(fasta_line + '\n' for fasta_line in fasta_lines)
    print('The results are in file chimp.fna.')
def application_2():
    """Converting accession numbers to data.

    Take a fixed, comma-delimited list of accession numbers, combine them
    into a single ``[accn]`` query and print the FASTA data returned by the
    fetch, one line at a time.
    """
    # Input: the accessions, given as one comma-delimited string.
    accession_ids = 'NM_009417,NM_000547,NM_001003009,NM_019353'.split(',')
    tagged = [accession + '[accn]' for accession in accession_ids]
    search_term = ' OR '.join(tagged)

    # Output: FASTA data, printed as it is received.
    fasta_lines = entrez.on_search(db='nucleotide', term=search_term,
                                   tool='fetch', db2='protein',
                                   rettype='fasta')
    for fasta_line in fasta_lines:
        print(fasta_line)
def application_2():
    """Sample Application 2: Converting accession numbers to data

    Take a fixed, comma-delimited list of accession numbers, combine them
    into a single ``[accn]`` query against ``nuccore`` and print the FASTA
    data returned by the fetch, one line at a time.
    """
    # Input: the accessions, given as one comma-delimited string.
    accession_ids = 'NM_009417,NM_000547,NM_001003009,NM_019353'.split(',')
    tagged = [accession + '[accn]' for accession in accession_ids]
    search_term = ' OR '.join(tagged)

    # Output: FASTA data, printed as it is received.
    fasta_lines = ez.on_search(db='nuccore', term=search_term,
                               tool='fetch', rettype='fasta')
    for fasta_line in fasta_lines:
        print(fasta_line)