import os, re, time, datetime, csv, sys, json
from upload import upload
import rethinkdb as r
from Bio import SeqIO
import argparse
from parse import parse
from upload import parser

sys.path.append('')  # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload

parser.add_argument('--assay_type', default='hi', help='type of assay being recorded')

class nimr_upload(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.assay_type = kwargs['assay_type']
        self.assay_date = None

    def format_measurements(self, measurements, **kwargs):
        '''
        format virus information in preparation to upload to database table
        '''
        self.fix_whole_name = self.define_strain_fixes(self.strain_fix_fname)
        self.fix_whole_name.update(self.define_strain_fixes(self.HI_strain_fix_fname))
        self.HI_ref_name_abbrev = self.define_strain_fixes(self.HI_ref_name_abbrev_fname)
        self.define_location_fixes("source-data/flu_fix_location_label.tsv")
        self.define_countries("source-data/geo_synonyms.tsv")
        for meas in measurements:
            meas['virus_strain'], meas['original_virus_strain'] = self.fix_name(self.HI_fix_name(meas['virus_strain'], serum=False))
            meas['serum_strain'], meas['original_serum_strain'] = self.fix_name(self.HI_fix_name(meas['serum_strain'], serum=True))
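# --- Illustrative sketch (not part of the original class) ------------------------------
# define_strain_fixes is inherited from the parent upload class; the stand-in below only
# illustrates the expected shape of the fix tables loaded above (a two-column TSV mapping
# a raw strain label to its corrected form). The real implementation may differ.
def _define_strain_fixes_sketch(fname):
    fix_whole_name = {}
    with open(fname, 'r') as handle:
        for line in csv.reader(handle, delimiter='\t'):
            if len(line) >= 2 and not line[0].startswith('#'):
                fix_whole_name[line[0]] = line[1]    # raw label -> fixed label
    return fix_whole_name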
import os, re, time, datetime, csv, sys, json, errno
from upload import upload
import rethinkdb as r
from Bio import SeqIO
import argparse
import subprocess
from parse import parse
from upload import parser

sys.path.append('')  # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload
import logging
logger = logging.getLogger()

parser.add_argument('--assay_type', default='hi')

def read_vidrl(path, fstem, assay_type):
    '''
    Read all csv tables in path, create data frame with reference viruses as columns
    '''
    fname = path + fstem + ".csv"
    # import glob
    # flist = glob.glob(path + '/NIMR*csv') #BP
    exten = [os.path.isfile(path + fstem + ext) for ext in ['.xls', '.xlsm', '.xlsx']]

    if True in exten:
        ind = exten.index(True)
        convert_xls_to_csv(path, fstem, ind)
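# --- Illustrative sketch (not part of the original file) -------------------------------
# read_vidrl above calls convert_xls_to_csv, which is defined elsewhere in the repository.
# The stand-in below only shows the intended effect (write the detected Excel workbook out
# as <fstem>.csv) and assumes pandas is available; the real implementation may differ.
def _convert_xls_to_csv_sketch(path, fstem, ind):
    import pandas as pd
    ext = ['.xls', '.xlsm', '.xlsx'][ind]
    # read the first sheet without treating any row as a header, then dump it to csv
    sheet = pd.read_excel(path + fstem + ext, header=None)
    sheet.to_csv(path + fstem + ".csv", header=False, index=False)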
import os, re, time, datetime, csv, sys, json
import numpy as np
import rethinkdb as r
from Bio import SeqIO
from Bio import AlignIO
from upload import upload
from upload import parser
from unidecode import unidecode

parser.add_argument('--upload_directory', default=False, action="store_true", help='upload all xls and fasta files in directory')
parser.add_argument('--vtype', default=None, help="type of virus, if applicable")
parser.add_argument('--subtype', default=None, help="subtype of virus")
parser.add_argument('--lineage', default=None, help="lineage of virus")

class flu_upload(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.grouping_upload_fields = ['vtype', 'subtype', 'lineage']
        # patterns from the subtype and lineage fields in the GISAID fasta file
        self.patterns = {('a / h1n1', 'pdm09'): ('a', 'h1n1', 'seasonal_h1n1pdm'),
                         ('a / h1n2', ''): ('a', 'h1n2', None),
                         ('a / h1n2', 'seasonal'): ('a', 'h1n2', 'seasonal_h1n2'),
                         ('a / h2n2', ''): ('a', 'h2n2', None),
                         ('a / h3n2', ''): ('a', 'h3n2', 'seasonal_h3n2'),
                         ('a / h3n2', 'seasonal'): ('a', 'h3n2', 'seasonal_h3n2'),
                         ('a / h3n3', ''): ('a', 'h3n3', None),
                         ('a / h5n1', ''): ('a', 'h5n1', None),
                         ('a / h5n6', ''): ('a', 'h5n6', None),
                         ('a / h6n1', ''): ('a', 'h6n1', None),
                         ('a / h7n1', ''): ('a', 'h7n1', None),
                         ('a / h7n2', ''): ('a', 'h7n2', None),
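# --- Illustrative sketch (not part of the original class) ------------------------------
# The patterns dict above (truncated in this excerpt) maps the lowercased GISAID
# (subtype, lineage) pair to a (vtype, subtype, lineage) grouping. A hypothetical lookup
# helper might read as follows; the class's own matching logic may handle more cases.
def lookup_grouping(patterns, gisaid_subtype, gisaid_lineage):
    key = (gisaid_subtype.strip().lower(), gisaid_lineage.strip().lower())
    # fall back to the empty-lineage entry when the exact pair is not listed
    return patterns.get(key, patterns.get((key[0], ''), (None, None, None)))

# lookup_grouping(self.patterns, 'A / H3N2', 'seasonal')  ->  ('a', 'h3n2', 'seasonal_h3n2')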
import os, re, time, datetime, csv, sys, json
from upload import upload
from rethinkdb import r
from Bio import SeqIO
import argparse
from parse import parse
from upload import parser

sys.path.append('')  # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload

parser.add_argument("--rename", action='store_true')

class cdc_upload(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.removal_fields = [
            'tested_by_fra', 'reported_by_fra', 'date', 'virus_collection_date',
            'ref', 'virus_harvest_date', 'Boosted', 'RBC_species'
        ]
        self.cleanup_fields = {
            'assay-type': 'assay_type',
            'lot #': 'lot_number'
        }

    def upload(self, ftype='flat', preview=False, **kwargs):
        '''
        format virus information, then upload to database
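# --- Illustrative sketch (not part of the original class) ------------------------------
# removal_fields and cleanup_fields suggest a per-record cleanup pass: drop bookkeeping
# columns and rename inconsistent headers. A hypothetical helper along those lines:
def _clean_measurement_sketch(meas, removal_fields, cleanup_fields):
    for field in removal_fields:
        meas.pop(field, None)                    # drop the field if present
    for old, new in cleanup_fields.items():
        if old in meas:
            meas[new] = meas.pop(old)            # rename e.g. 'assay-type' -> 'assay_type'
    return meas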
import os, json
import rethinkdb as r
from upload import upload
from upload import parser

parser.add_argument('--update_citations', default=False, action="store_true", help="update citation fields")
parser.add_argument('--update_locations', default=False, action="store_true", help="update location fields")
parser.add_argument('--update_groupings', default=False, action="store_true", help="update grouping fields")

class update(upload):
    def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.location_fields = ['location', 'division', 'country']

    def update(self, update_citations, update_locations, update_groupings, **kwargs):
        self.connect(**kwargs)
        if update_citations:
            self.update_citations(table=self.sequences_table, **kwargs)
        elif update_locations:
            self.update_locations(table=self.viruses_table, **kwargs)
        elif update_groupings:
            self.update_groupings(self.viruses_table, self.sequences_table, **kwargs)
        else:
            self.update_genbank_documents(**kwargs)

    def update_citations(self, database, table, preview, index='accession', **kwargs):
        print("Updating citation fields")
        _, sequences = self.get_genbank_sequences(**kwargs)
        self.format_sequences(sequences, **kwargs)
        self.match_duplicate_accessions(sequences, **kwargs)
        self.match_database_duplicate_accessions(sequences, virus=self.virus, database=database)
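# --- Illustrative driver sketch (not part of the original file) ------------------------
# A minimal command-line entry point in the style of the upload scripts above: parse the
# shared arguments and dispatch on the --update_* flags. The connection options accepted
# by the shared parser (e.g. database/virus selection) are assumed, not shown here.
if __name__ == "__main__":
    args = parser.parse_args()
    connVDB = update(**args.__dict__)   # hypothetical variable name
    connVDB.update(**args.__dict__)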