Esempio n. 1
0
def populate_organism():
    """Create an Organism row for each accession in bac_accession_list.txt.

    The accession list is read from ``DATA_DIR``, the matching GenBank
    records are fetched from NCBI's ``nuccore`` database, and every
    (organism name, accession) pair is stored through ``get_or_create``.
    """
    def add_organism(name, accession):
        # get_or_create returns the existing row when one already matches,
        # so re-running does not insert duplicates
        organism, _ = Organism.objects.get_or_create(name=name,
                                                     accession=accession)
        return organism

    def merge_acc_names(accession_list):
        """Return a dict mapping each fetched record name to its organism."""
        acc_name_dict = {}
        db = "nuccore"
        # Doing batches of 200 to make sure requests to NCBI are not too big
        for i in range(0, len(accession_list), 200):
            result_handle = Entrez.efetch(db=db,
                                          rettype="gb",
                                          id=accession_list[i:i + 200])
            try:
                # Populate result per organism name
                for record in tqdm(SeqIO.parse(result_handle, 'genbank')):
                    # Using NCBI name, which should match accession number
                    # passed
                    acc_name_dict[record.name] = \
                        record.annotations['organism']
            finally:
                # Release the connection even if parsing fails part-way
                result_handle.close()
        return acc_name_dict

    with open(os.path.join(DATA_DIR, 'bac_accession_list.txt')) as f:
        accession_list = list(read_accession_file(f))

    # The accession file only holds accessions; the names come from NCBI.
    # (Indexing the raw list by accession string would raise TypeError.)
    acc_name_dict = merge_acc_names(accession_list)
    for accession, name in acc_name_dict.items():
        add_organism(name=name, accession=accession)
Esempio n. 2
0
def populate_organism():
    """Create an Organism row for each accession in bac_accession_list.txt.

    The accession list is read from ``DATA_DIR``, the matching GenBank
    records are fetched from NCBI's ``nuccore`` database, and every
    (organism name, accession) pair is stored through ``get_or_create``.
    """
    def add_organism(name, accession):
        # get_or_create returns the existing row when one already matches,
        # so re-running does not insert duplicates
        organism, _ = Organism.objects.get_or_create(
            name=name, accession=accession)
        return organism

    def merge_acc_names(accession_list):
        """Return a dict mapping each fetched record name to its organism."""
        acc_name_dict = {}
        db = "nuccore"
        # Doing batches of 200 to make sure requests to NCBI are not too big
        for i in range(0, len(accession_list), 200):
            result_handle = Entrez.efetch(
                db=db, rettype="gb", id=accession_list[i:i + 200])
            try:
                # Populate result per organism name
                for record in tqdm(SeqIO.parse(result_handle, 'genbank')):
                    # Using NCBI name, which should match accession number
                    # passed
                    acc_name_dict[record.name] = \
                        record.annotations['organism']
            finally:
                # Release the connection even if parsing fails part-way
                result_handle.close()
        return acc_name_dict

    with open(os.path.join(DATA_DIR, 'bac_accession_list.txt')) as f:
        accession_list = list(read_accession_file(f))

    # The accession file only holds accessions; the names come from NCBI.
    # (Indexing the raw list by accession string would raise TypeError.)
    acc_name_dict = merge_acc_names(accession_list)
    for accession, name in acc_name_dict.items():
        add_organism(name=name, accession=accession)
Esempio n. 3
0
def populate_anticrispr():
    """Fetch the listed AntiCRISPR proteins from NCBI and store them.

    Accessions are read from ``antiCRISPR_accessions.txt`` under
    ``DATA_DIR``, fetched as FASTA from the ``protein`` database in a single
    request, and each record is stored as an AntiCRISPR row keyed on its
    accession and sequence.
    """
    with open(os.path.join(DATA_DIR, 'antiCRISPR_accessions.txt')) as f:
        accession_list = list(read_accession_file(f))
    print("Fetching AntiCRISPR entries")
    result_handle = Entrez.efetch(db='protein',
                                  rettype="fasta",
                                  id=accession_list)
    try:
        for record in tqdm(SeqIO.parse(result_handle, 'fasta')):
            # get_or_create persists a newly created row itself, so no
            # follow-up save() is needed
            AntiCRISPR.objects.get_or_create(accession=record.name,
                                             sequence=str(record.seq))
    finally:
        # Release the connection even if parsing or the DB write fails
        result_handle.close()
Esempio n. 4
0
def populate_anticrispr():
    """Fetch the listed AntiCRISPR proteins from NCBI and store them.

    Accessions are read from ``antiCRISPR_accessions.txt`` under
    ``DATA_DIR``, fetched as FASTA from the ``protein`` database in a single
    request, and each record is stored as an AntiCRISPR row keyed on its
    accession and sequence.
    """
    with open(os.path.join(DATA_DIR, 'antiCRISPR_accessions.txt')) as f:
        accession_list = list(read_accession_file(f))
    print("Fetching AntiCRISPR entries")
    result_handle = Entrez.efetch(
        db='protein', rettype="fasta", id=accession_list)
    try:
        for record in tqdm(SeqIO.parse(result_handle, 'fasta')):
            # get_or_create persists a newly created row itself, so no
            # follow-up save() is needed
            AntiCRISPR.objects.get_or_create(
                accession=record.name,
                sequence=str(record.seq))
    finally:
        # Release the connection even if parsing or the DB write fails
        result_handle.close()
Esempio n. 5
0
def test_read_accession_file_check_formatting():
    """Only the three accessions should come back from a messy input.

    The fixture mixes comment lines, a blank line, and accessions with
    varying leading and trailing whitespace.
    """
    raw_text = """
# comment on left
123left
                        345right
        987middle              
        # comment in middle, with trailing space

        # ^^ blank line
"""
    expected = ['123left', '345right', '987middle']
    parsed = list(read_accession_file(io.StringIO(raw_text)))
    assert parsed == expected
Esempio n. 6
0
def test_read_accession_file_empty():
    """An empty input stream must produce no accessions."""
    empty_stream = io.StringIO("")
    parsed = list(read_accession_file(empty_stream))
    assert parsed == []