Example #1
import os, re, time, datetime, csv, sys, json
from upload import upload
import rethinkdb as r
from Bio import SeqIO
import argparse
from parse import parse
from upload import parser
sys.path.append('')  # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload

parser.add_argument('--assay_type', default='hi', help='type of assay being recorded')

class nimr_upload(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.assay_type = kwargs['assay_type']
        self.assay_date = None

    def format_measurements(self, measurements, **kwargs):
        '''
        format measurement information in preparation for upload to the database table
        '''
        self.fix_whole_name = self.define_strain_fixes(self.strain_fix_fname)
        self.fix_whole_name.update(self.define_strain_fixes(self.HI_strain_fix_fname))
        self.HI_ref_name_abbrev = self.define_strain_fixes(self.HI_ref_name_abbrev_fname)
        self.define_location_fixes("source-data/flu_fix_location_label.tsv")
        self.define_countries("source-data/geo_synonyms.tsv")
        for meas in measurements:
            meas['virus_strain'], meas['original_virus_strain'] = self.fix_name(self.HI_fix_name(meas['virus_strain'], serum=False))
            meas['serum_strain'], meas['original_serum_strain'] = self.fix_name(self.HI_fix_name(meas['serum_strain'], serum=True))
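# Illustrative sketch (not part of nimr_upload): the fix tables loaded above are
# presumably two-column TSV files mapping a mis-recorded strain label to its
# canonical form. load_strain_fixes and apply_fix are hypothetical names used only
# to show that lookup pattern in isolation.
import csv

def load_strain_fixes(tsv_path):
    '''Return {bad_label: fixed_label} read from a two-column TSV file.'''
    fixes = {}
    with open(tsv_path) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if len(row) >= 2 and not row[0].startswith('#'):
                fixes[row[0].strip()] = row[1].strip()
    return fixes

def apply_fix(name, fixes):
    '''Return (possibly fixed name, original name), mirroring the tuple shape above.'''
    return fixes.get(name, name), name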
Example #2
import os, re, time, datetime, csv, sys, json, errno
from upload import upload
import rethinkdb as r
from Bio import SeqIO
import argparse
import subprocess
from parse import parse
from upload import parser
sys.path.append('')  # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload
import logging
logger = logging.getLogger()

parser.add_argument('--assay_type', default='hi', help='type of assay being recorded')


def read_vidrl(path, fstem, assay_type):
    '''
    Read all csv tables in path, create data frame with reference viruses as columns
    '''
    fname = path + fstem + ".csv"
    # check whether the table exists as an Excel workbook and, if so, convert it to csv
    exten = [
        os.path.isfile(path + fstem + ext)
        for ext in ['.xls', '.xlsm', '.xlsx']
    ]
    if True in exten:
        ind = exten.index(True)
        convert_xls_to_csv(path, fstem, ind)
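# Illustrative sketch (find_spreadsheet is a hypothetical helper, not defined in this
# file): the extension probe above can also be written as a loop that returns the first
# spreadsheet found, which avoids indexing back into a list of booleans.
import os

def find_spreadsheet(path, fstem, extensions=('.xls', '.xlsm', '.xlsx')):
    '''Return the path of the first existing spreadsheet for fstem, or None.'''
    for ext in extensions:
        candidate = path + fstem + ext
        if os.path.isfile(candidate):
            return candidate
    return None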
Example #3
import os, re, time, datetime, csv, sys, json
import numpy as np
import rethinkdb as r
from Bio import SeqIO
from Bio import AlignIO
from upload import upload
from upload import parser
from unidecode import unidecode

parser.add_argument('--upload_directory', default=False, action="store_true", help='upload all xls and fasta files in directory')
parser.add_argument('--vtype', default=None, help="type of virus, if applicable")
parser.add_argument('--subtype', default=None, help="subtype of virus")
parser.add_argument('--lineage', default=None, help="lineage of virus")

class flu_upload(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.grouping_upload_fields = ['vtype', 'subtype', 'lineage']
        # patterns from the subtype and lineage fields in the GISAID fasta file
        self.patterns = {('a / h1n1', 'pdm09'): ('a', 'h1n1', 'seasonal_h1n1pdm'),
                    ('a / h1n2', ''): ('a', 'h1n2', None),
                    ('a / h1n2', 'seasonal'): ('a', 'h1n2', 'seasonal_h1n2'),
                    ('a / h2n2', ''): ('a', 'h2n2', None),
                    ('a / h3n2', ''): ('a', 'h3n2', 'seasonal_h3n2'),
                    ('a / h3n2', 'seasonal'): ('a', 'h3n2', 'seasonal_h3n2'),
                    ('a / h3n3', ''): ('a', 'h3n3', None),
                    ('a / h5n1', ''): ('a', 'h5n1', None),
                    ('a / h5n6', ''): ('a', 'h5n6', None),
                    ('a / h6n1', ''): ('a', 'h6n1', None),
                    ('a / h7n1', ''): ('a', 'h7n1', None),
                    ('a / h7n2', ''): ('a', 'h7n2', None),
Example #4
import os, re, time, datetime, csv, sys, json
import numpy as np
import rethinkdb as r
from Bio import SeqIO
from Bio import AlignIO
from upload import upload
from upload import parser
from unidecode import unidecode

parser.add_argument('--upload_directory',
                    default=False,
                    action="store_true",
                    help='upload all xls and fasta files in directory')
parser.add_argument('--vtype',
                    default=None,
                    help="type of virus, if applicable")
parser.add_argument('--subtype', default=None, help="subtype of virus")
parser.add_argument('--lineage', default=None, help="lineage of virus")


class flu_upload(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.grouping_upload_fields = ['vtype', 'subtype', 'lineage']
        # patterns from the subtype and lineage fields in the GISAID fasta file
        self.patterns = {
            ('a / h1n1', 'pdm09'): ('a', 'h1n1', 'seasonal_h1n1pdm'),
            ('a / h1n2', ''): ('a', 'h1n2', None),
            ('a / h1n2', 'seasonal'): ('a', 'h1n2', 'seasonal_h1n2'),
            ('a / h2n2', ''): ('a', 'h2n2', None),
            ('a / h3n2', ''): ('a', 'h3n2', 'seasonal_h3n2'),
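# Illustrative sketch (classify_strain is a hypothetical helper, not part of flu_upload):
# the patterns table above keys on the lower-cased GISAID (subtype, lineage) pair and
# yields a (vtype, subtype, lineage) triple, so a lookup reduces to a dict.get.
def classify_strain(gisaid_subtype, gisaid_lineage, patterns):
    '''Return (vtype, subtype, lineage) for a GISAID record, or (None, None, None).'''
    key = (gisaid_subtype.strip().lower(), gisaid_lineage.strip().lower())
    return patterns.get(key, (None, None, None))

# e.g. classify_strain('A / H3N2', 'Seasonal', patterns) -> ('a', 'h3n2', 'seasonal_h3n2')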
Example #5
import os, json
import rethinkdb as r
from upload import upload
from upload import parser

parser.add_argument('--update_citations',
                    default=False,
                    action="store_true",
                    help="update citation fields")
parser.add_argument('--update_locations',
                    default=False,
                    action="store_true",
                    help="update location fields")
parser.add_argument('--update_groupings',
                    default=False,
                    action="store_true",
                    help="update grouping fields")


class update(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.location_fields = ['location', 'division', 'country', 'region']

    def update(self, update_citations, update_locations, update_groupings,
               **kwargs):
        self.connect(**kwargs)
        if update_citations:
            self.update_citations(table=self.sequences_table, **kwargs)
        elif update_locations:
            self.update_locations(table=self.viruses_table, **kwargs)
Example #6
import os, re, time, datetime, csv, sys, json
from upload import upload
from rethinkdb import r
from Bio import SeqIO
import argparse
from parse import parse
from upload import parser

sys.path.append('')  # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload

parser.add_argument("--rename", action='store_true')


class cdc_upload(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.removal_fields = [
            'tested_by_fra', 'reported_by_fra', 'date',
            'virus_collection_date', 'ref', 'virus_harvest_date', 'Boosted',
            'RBC_species'
        ]
        self.cleanup_fields = {
            'assay-type': 'assay_type',
            'lot #': 'lot_number'
        }

    def upload(self, ftype='flat', preview=False, **kwargs):
        '''
        format virus information, then upload to database
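# Illustrative sketch (clean_record is a hypothetical helper, not part of cdc_upload):
# one way the removal_fields / cleanup_fields attributes above could be applied to a
# single flat record before upload.
def clean_record(record, removal_fields, cleanup_fields):
    '''Drop unwanted keys and rename the remaining ones via cleanup_fields.'''
    cleaned = {}
    for key, value in record.items():
        if key in removal_fields:
            continue
        cleaned[cleanup_fields.get(key, key)] = value
    return cleaned

# e.g. clean_record({'assay-type': 'hi', 'ref': 'x'}, ['ref'], {'assay-type': 'assay_type'})
# -> {'assay_type': 'hi'}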
Example #7
import os, json
import rethinkdb as r
from upload import upload
from upload import parser

parser.add_argument('--update_citations', default=False, action="store_true", help="update citation fields")
parser.add_argument('--update_locations', default=False, action="store_true", help="update location fields")
parser.add_argument('--update_groupings', default=False, action="store_true", help="update grouping fields")

class update(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.location_fields = ['location', 'division', 'country']

    def update(self, update_citations, update_locations, update_groupings, **kwargs):
        self.connect(**kwargs)
        if update_citations:
            self.update_citations(table=self.sequences_table, **kwargs)
        elif update_locations:
            self.update_locations(table=self.viruses_table, **kwargs)
        elif update_groupings:
            self.update_groupings(self.viruses_table, self.sequences_table, **kwargs)
        else:
            self.update_genbank_documents(**kwargs)

    def update_citations(self, database, table, preview, index='accession', **kwargs):
        print("Updating citation fields")
        _, sequences = self.get_genbank_sequences(**kwargs)
        self.format_sequences(sequences, **kwargs)
        self.match_duplicate_accessions(sequences, **kwargs)
        self.match_database_duplicate_accessions(sequences, virus=self.virus, database=database)
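# Illustrative sketch (an assumption about how these scripts are typically driven; the
# entry point is not shown in the excerpt): the shared argparse parser imported above is
# parsed in a __main__ block and the resulting namespace is fanned out into the update
# class, whose update() method then runs exactly one of the update_* branches.
if __name__ == "__main__":
    args = parser.parse_args()            # parser comes from the shared upload module
    connVDB = update(**args.__dict__)     # construct with the parsed database options
    connVDB.update(**args.__dict__)       # dispatch on the mutually exclusive flags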