def populate_organism():
    """Create an Organism entry for every accession in the bacterial list.

    Accession numbers are read from ``bac_accession_list.txt`` in DATA_DIR,
    their organism names are fetched from the NCBI ``nuccore`` database, and
    one ``Organism`` is fetched or created per (name, accession) pair.
    """
    def add_organism(name, accession):
        """Return the Organism for this name/accession, creating it if absent."""
        # get the object, this also checks for duplicates
        organism, _ = Organism.objects.get_or_create(
            name=name, accession=accession)
        return organism

    def merge_acc_names(accession_list):
        """Return a dict mapping each fetched record's name to its organism."""
        acc_name_dict = {}
        db = "nuccore"
        batch_size = 200
        # Doing batches of 200 to make sure requests to NCBI are not too big
        for start in range(0, len(accession_list), batch_size):
            batch = accession_list[start:start + batch_size]
            result_handle = Entrez.efetch(db=db, rettype="gb", id=batch)
            try:
                # Populate result per organism name. The parser is lazy, so
                # it must be consumed before the handle is closed.
                for record in tqdm(SeqIO.parse(result_handle, 'genbank')):
                    # Using NCBI name, which should match accession number
                    # passed
                    acc_name_dict[record.name] = record.annotations['organism']
            finally:
                result_handle.close()
        return acc_name_dict

    with open(os.path.join(DATA_DIR, 'bac_accession_list.txt')) as f:
        accession_list = list(read_accession_file(f))

    # The file only holds accession numbers; the names come from NCBI.
    acc_name_dict = merge_acc_names(accession_list)
    for accession, name in acc_name_dict.items():
        add_organism(name=name, accession=accession)
def populate_organism():
    """Create an Organism entry for every accession in the bacterial list.

    Accession numbers are read from ``bac_accession_list.txt`` in DATA_DIR,
    their organism names are fetched from the NCBI ``nuccore`` database, and
    one ``Organism`` is fetched or created per (name, accession) pair.
    """
    def add_organism(name, accession):
        """Return the Organism for this name/accession, creating it if absent."""
        # get the object, this also checks for duplicates
        organism, _ = Organism.objects.get_or_create(
            name=name, accession=accession)
        return organism

    def merge_acc_names(accession_list):
        """Return a dict mapping each fetched record's name to its organism."""
        acc_name_dict = {}
        db = "nuccore"
        batch_size = 200
        # Doing batches of 200 to make sure requests to NCBI are not too big
        for start in range(0, len(accession_list), batch_size):
            batch = accession_list[start:start + batch_size]
            result_handle = Entrez.efetch(
                db=db, rettype="gb", id=batch)
            try:
                # Populate result per organism name. The parser is lazy, so
                # it must be consumed before the handle is closed.
                for record in tqdm(SeqIO.parse(result_handle, 'genbank')):
                    # Using NCBI name, which should match accession number
                    # passed
                    acc_name_dict[record.name] = record.annotations['organism']
            finally:
                result_handle.close()
        return acc_name_dict

    with open(os.path.join(DATA_DIR, 'bac_accession_list.txt')) as f:
        accession_list = list(read_accession_file(f))

    # The file only holds accession numbers; the names come from NCBI.
    acc_name_dict = merge_acc_names(accession_list)
    for accession, name in acc_name_dict.items():
        add_organism(name=name, accession=accession)
def populate_anticrispr():
    """Fetch the listed anti-CRISPR protein sequences and store them.

    Accessions are read from ``antiCRISPR_accessions.txt`` in DATA_DIR and
    fetched as FASTA from the Entrez ``protein`` database. An ``AntiCRISPR``
    object is fetched or created for each record, keyed on the record name
    and its sequence.
    """
    with open(os.path.join(DATA_DIR, 'antiCRISPR_accessions.txt')) as f:
        accession_list = list(read_accession_file(f))
    print("Fetching AntiCRISPR entries")
    if not accession_list:
        # Nothing to fetch; avoid sending a request with an empty id list.
        return
    result_handle = Entrez.efetch(db='protein', rettype="fasta", id=accession_list)
    try:
        # The parser reads lazily, so iterate before the handle is closed.
        for record in tqdm(SeqIO.parse(result_handle, 'fasta')):
            anticrispr, _ = AntiCRISPR.objects.get_or_create(accession=record.name, sequence=str(record.seq))
            # NOTE(review): get_or_create already persists new objects, so
            # this save() looks redundant; kept in case the model's save()
            # has side effects that callers rely on.
            anticrispr.save()
    finally:
        result_handle.close()
def populate_anticrispr():
    """Fetch the listed anti-CRISPR protein sequences and store them.

    Accessions are read from ``antiCRISPR_accessions.txt`` in DATA_DIR and
    fetched as FASTA from the Entrez ``protein`` database. An ``AntiCRISPR``
    object is fetched or created for each record, keyed on the record name
    and its sequence.
    """
    with open(os.path.join(DATA_DIR, 'antiCRISPR_accessions.txt')) as f:
        accession_list = list(read_accession_file(f))
    print("Fetching AntiCRISPR entries")
    if not accession_list:
        # Nothing to fetch; avoid sending a request with an empty id list.
        return
    result_handle = Entrez.efetch(
        db='protein', rettype="fasta", id=accession_list)
    try:
        # The parser reads lazily, so iterate before the handle is closed.
        for record in tqdm(SeqIO.parse(result_handle, 'fasta')):
            anticrispr, _ = AntiCRISPR.objects.get_or_create(
                accession=record.name, sequence=str(record.seq))
            # NOTE(review): get_or_create already persists new objects, so
            # this save() looks redundant; kept in case the model's save()
            # has side effects that callers rely on.
            anticrispr.save()
    finally:
        result_handle.close()
def test_read_accession_file_check_formatting():
    """Only the three accession tokens are yielded from a messy fixture.

    The fixture mixes full-line comments, an inline comment, a blank line
    and entries padded with leading/trailing whitespace. It is written one
    line at a time so that the significant whitespace is visible and cannot
    be silently altered by editors or formatters that trim or re-wrap text.
    """
    fixture_lines = [
        "",
        "# comment on left",
        "123left",
        "    345right",
        "  987middle  # comment in middle, with trailing space ",
        "",
        "# ^^ blank line",
        "",
    ]
    f = io.StringIO("\n".join(fixture_lines))
    assert list(read_accession_file(f)) == [
        '123left',
        '345right',
        '987middle',
    ]
def test_read_accession_file_empty():
    """An empty input stream yields no accessions."""
    empty_stream = io.StringIO("")
    accessions = list(read_accession_file(empty_stream))
    assert accessions == []