def data_collector(sample_id, sample_type, report_type, doc_type, req):
    """Return one section of an NCBI record, or the parsed raw document.

    When ``doc_type`` is ``'html'`` the call is routed on ``req`` to the
    matching section helper, each of which is invoked with
    ``(sample_id, sample_type)``. For any other ``doc_type`` the record is
    downloaded from the NCBI sviewer endpoint and parsed.

    Args:
        sample_id: Record identifier; sent as the ``id`` URL parameter.
        sample_type: Database name; sent as the ``db`` URL parameter.
        report_type: Sent as the ``report`` URL parameter. Only used when
            ``doc_type`` is not ``'html'``.
        doc_type: ``'html'`` selects section dispatch; any other value is
            sent as the ``retmode`` URL parameter.
        req: Section to return when ``doc_type`` is ``'html'``: one of
            ``"intro"``, ``"name"``, ``"gene"``, ``"stem-loop"``,
            ``"peptide"``, ``"cds"``, ``"source"``, ``"comment"``,
            ``"sequence"`` or ``"all"``. Ignored otherwise.

    Returns:
        For ``doc_type == 'html'``: whatever the selected helper returns
        (``"all"`` returns the ``.text`` of the ``soup_collector`` result),
        or ``None`` after printing a message when ``req`` is not recognised.
        Otherwise: a ``(soup, url)`` tuple holding the parsed document with
        ``<script>`` and ``<style>`` elements removed, and the URL fetched.
    """
    if doc_type != 'html':
        url = (
            "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=" + sample_id
            + "&db=" + sample_type
            + "&report=" + report_type
            + "&extrafeat=null&conwithfeat=on&retmode=" + doc_type
            + "&tool=portal&maxdownloadsize=1000000"
        )
        # Without a timeout, requests waits indefinitely on a stalled server.
        data = requests.get(url, timeout=30)
        soup = bs(data.content, 'html.parser')
        # Drop non-content elements so they do not pollute extracted text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        return soup, url

    # An if/elif chain keeps helper lookup lazy: only the helper that is
    # actually requested has to exist at call time.
    if req == "intro":
        return intro(sample_id, sample_type)
    elif req == "name":
        return name_collector(sample_id, sample_type)
    elif req == "gene":
        return feature_gene(sample_id, sample_type)
    elif req == "stem-loop":
        return feature_stem_loop(sample_id, sample_type)
    elif req == "peptide":
        return feature_peptide(sample_id, sample_type)
    elif req == "cds":
        return feature_cds(sample_id, sample_type)
    elif req == "source":
        return feature_source(sample_id, sample_type)
    elif req == "comment":
        return comment(sample_id, sample_type)
    elif req == "sequence":
        return chain_sequence(sample_id, sample_type)
    elif req == "all":
        return soup_collector(sample_id, sample_type).text

    print('Invalid parameter supplied!')
    return None
def stem(spl_id_dup, spl_type_dup):
    """Print the text of every stem-loop feature span of a sample.

    Looks up ``<span>`` elements whose id is
    ``feature_<name>_stem_loop_<n>`` for n = 0, 1, 2, ... in the document
    returned by ``soup_collector`` and prints each one's text, stopping at
    the first index that has no matching span. ``<name>`` is the value
    returned by ``name_collector``.

    Args:
        spl_id_dup: Sample identifier, forwarded to ``soup_collector`` and
            ``name_collector``.
        spl_type_dup: Sample type, forwarded to the same helpers.

    Returns:
        None. Output goes to stdout.
    """
    # Use the function's own arguments; the previous body read the names
    # ``spl_id`` / ``spl_type``, which are not parameters of this function.
    soup = soup_collector(spl_id_dup, spl_type_dup)
    sample_info_name = name_collector(spl_id_dup, spl_type_dup)
    index = 0
    while True:
        span_id = "feature_" + sample_info_name + "_stem_loop_" + str(index)
        stem_loop = soup.find('span', {"id": span_id})
        if stem_loop is None:
            break
        print(stem_loop.text)
        index += 1
def sequence(spl_id_dup, spl_type_dup):
    """Print the sequence spans of a sample, one span per line.

    Looks up ``<span>`` elements whose id is ``<name>_<pos>`` for
    pos = 1, 61, 121, ... in the document returned by ``soup_collector``,
    stopping at the first position that has no matching span. ``<name>`` is
    the value returned by ``name_collector``.

    Args:
        spl_id_dup: Sample identifier, forwarded to ``soup_collector`` and
            ``name_collector``.
        spl_type_dup: Sample type, forwarded to the same helpers.

    Returns:
        None. Output goes to stdout.
    """
    # Use the function's own arguments; the previous body read the names
    # ``spl_id`` / ``spl_type``, which are not parameters of this function.
    soup = soup_collector(spl_id_dup, spl_type_dup)
    sample_info_name = name_collector(spl_id_dup, spl_type_dup)
    position = 1
    while True:
        source = soup.find('span', {"id": sample_info_name + "_" + str(position)})
        if source is None:
            break
        position += 60
        # NOTE(review): the number printed is the position of the *next*
        # span (already advanced by 60), not of the one being printed.
        # Kept as-is; confirm whether the label should precede the increment.
        print(position, source.text)
def gene(spl_id_dup, spl_type_dup):
    """Print the text of every gene feature span of a sample.

    Looks up ``<span>`` elements whose id is ``feature_<name>_gene_<n>``
    for n = 0, 1, 2, ... in the document returned by ``soup_collector`` and
    prints each one's text, stopping at the first index that has no
    matching span. ``<name>`` is the value returned by ``name_collector``.

    Args:
        spl_id_dup: Sample identifier, forwarded to ``soup_collector`` and
            ``name_collector``.
        spl_type_dup: Sample type, forwarded to the same helpers.

    Returns:
        None. Output goes to stdout.
    """
    # Use the function's own arguments; the previous body read the names
    # ``spl_id`` / ``spl_type``, which are not parameters of this function.
    soup = soup_collector(spl_id_dup, spl_type_dup)
    sample_info_name = name_collector(spl_id_dup, spl_type_dup)
    index = 0
    while True:
        span_id = "feature_" + sample_info_name + "_gene_" + str(index)
        # Named gene_span so the local does not shadow this function.
        gene_span = soup.find('span', {"id": span_id})
        if gene_span is None:
            break
        print(gene_span.text)
        index += 1
def cds(spl_id_dup, spl_type_dup):
    """Print the text of every CDS feature span of a sample.

    Looks up ``<span>`` elements whose id is ``feature_<name>_CDS_<n>``
    for n = 0, 1, 2, ... in the document returned by ``soup_collector`` and
    prints each one's text, stopping at the first index that has no
    matching span. ``<name>`` is the value returned by ``name_collector``.

    Args:
        spl_id_dup: Sample identifier, forwarded to ``soup_collector`` and
            ``name_collector``.
        spl_type_dup: Sample type, forwarded to the same helpers.

    Returns:
        None. Output goes to stdout.
    """
    # Use the function's own arguments; the previous body read the names
    # ``spl_id`` / ``spl_type``, which are not parameters of this function.
    soup = soup_collector(spl_id_dup, spl_type_dup)
    sample_info_name = name_collector(spl_id_dup, spl_type_dup)
    index = 0
    while True:
        span_id = "feature_" + sample_info_name + "_CDS_" + str(index)
        # Named cds_span so the local does not shadow this function.
        cds_span = soup.find('span', {"id": span_id})
        if cds_span is None:
            break
        print(cds_span.text)
        index += 1
def peptide(spl_id_dup, spl_type_dup):
    """Print the text of every mat_peptide feature span of a sample.

    Looks up ``<span>`` elements whose id is
    ``feature_<name>_mat_peptide_<n>`` for n = 0, 1, 2, ... in the document
    returned by ``soup_collector`` and prints each one's text, stopping at
    the first index that has no matching span. ``<name>`` is the value
    returned by ``name_collector``.

    Args:
        spl_id_dup: Sample identifier, forwarded to ``soup_collector`` and
            ``name_collector``.
        spl_type_dup: Sample type, forwarded to the same helpers.

    Returns:
        None. Output goes to stdout.
    """
    # Use the function's own arguments; the previous body read the names
    # ``spl_id`` / ``spl_type``, which are not parameters of this function.
    soup = soup_collector(spl_id_dup, spl_type_dup)
    sample_info_name = name_collector(spl_id_dup, spl_type_dup)
    index = 0
    while True:
        span_id = "feature_" + sample_info_name + "_mat_peptide_" + str(index)
        peptide_span = soup.find('span', {"id": span_id})
        if peptide_span is None:
            break
        print(peptide_span.text)
        index += 1
def intro(spl_id, spl_type):
    """Return the seven nodes preceding the sample's comment anchor as text.

    Finds the first ``<a name="comment_<name>">`` element in the document
    returned by ``soup_collector`` (``<name>`` comes from
    ``name_collector``), walks back seven siblings from it and joins their
    string forms in document order.

    Args:
        spl_id: Sample identifier, forwarded to ``soup_collector`` and
            ``name_collector``.
        spl_type: Sample type, forwarded to the same helpers.

    Returns:
        The concatenated string, or ``None`` when the anchor is missing or
        the sibling chain ends before the seventh step. If the chain ends
        exactly at the seventh step, that last sibling is ``None`` and
        contributes the literal text ``"None"`` to the result.
    """
    soup = soup_collector(spl_id, spl_type)
    sample_info_name = name_collector(spl_id, spl_type)
    pieces = []
    try:
        # find() yields None when there is no match; the attribute access
        # below then raises AttributeError, which is the "not found" path.
        node = soup.find('a', {"name": 'comment_' + sample_info_name})
        for _ in range(7):
            node = node.previous_sibling
            pieces.append(str(node))
    except (AttributeError, IndexError, TypeError):
        # Best-effort lookup: a page without the expected layout (or a
        # non-string name) yields None rather than an exception.
        return None
    # Siblings were collected nearest-first; restore document order.
    pieces.reverse()
    return "".join(pieces)
def comment(spl_id, spl_type):
    """Return the two nodes that follow the RefSeq link, as one string.

    Finds the first ``<a href="https://www.ncbi.nlm.nih.gov/RefSeq/">``
    element in the document returned by ``soup_collector`` and concatenates
    the string forms of its next sibling and the sibling after that.

    Args:
        spl_id: Sample identifier, forwarded to ``soup_collector`` and
            ``name_collector``.
        spl_type: Sample type, forwarded to the same helpers.

    Returns:
        The concatenated string, or ``None`` (after printing a message) when
        the link or its first following sibling is missing. If only the
        second sibling is missing it contributes the literal text ``"None"``.
    """
    soup = soup_collector(spl_id, spl_type)
    # NOTE(review): the result of this call is not used in this function.
    # The call is retained in case name_collector has side effects -
    # confirm and remove if it does not.
    name_collector(spl_id, spl_type)
    try:
        # find() yields None when there is no match; the attribute access
        # below then raises AttributeError, which is the "not found" path.
        refseq_link = soup.find('a', {"href": 'https://www.ncbi.nlm.nih.gov/RefSeq/'})
        first_node = refseq_link.next_sibling
        second_node = first_node.next_sibling
    except AttributeError:
        print('Oops something went wrong...')
        return None
    return str(first_node) + str(second_node)