def make_config(params):
    """Build the prepare configuration for the ebola build.

    Args:
        params: parsed command-line options. Only ``viruses_per_month`` is
            read; it becomes the subsampling threshold.

    Returns:
        dict: input paths, FASTA header layout, sequence filters,
        subsampling rules, colour / lat-long traits and reference metadata.
    """
    # Written as ``1`` rather than ``01``: a leading-zero integer literal is
    # an octal in Python 2 and a SyntaxError in Python 3.
    earliest = datetime(2012, 1, 1).date()
    latest = datetime(2018, 1, 1).date()

    filters = (
        # fix_names / dropped_strains are module-level names, resolved when
        # the filter is applied.
        ("Dropped Strains",
         lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
        ("Restrict Date Range", lambda s: s.attributes['date'] >= earliest),
        ("Restrict Date Range", lambda s: s.attributes['date'] <= latest),
    )

    subsample = {
        # One bucket per (region, year, month).
        "category": lambda x: (x.attributes['region'],
                               x.attributes['date'].year,
                               x.attributes['date'].month),
        "threshold": params.viruses_per_month,
        "priority": lambda x: x.id in forced_strains,
    }

    reference = {
        "path": "metadata/ebola_outgroup.gb",
        "metadata": {
            'strain': "reference",
            "accession": "KR075003",
            "date": "2014-XX-XX",
            'host': "human",
            'country': "Liberia",
        },
        "include": 0,
        "genes": ['NP', 'VP35', 'VP40', 'GP', 'sGP', 'VP30', 'VP24', 'L'],
    }

    return {
        "dir": "ebola",
        "file_prefix": "ebola",
        "input_paths": ["../../fauna/data/ebola.fasta"],
        "header_fields": {
            0: 'strain', 2: 'accession', 3: 'date', 4: 'region',
            5: 'country', 6: 'division', 8: 'db', 10: 'authors', 11: 'url',
        },
        "filters": filters,
        "subsample": subsample,
        "colors": ["country", "division"],  # essential. Maybe False.
        "color_defs": ["./colors.tsv"],
        "lat_longs": ["country", "division"],  # essential. Maybe False.
        "reference": reference,
    }
def make_config(serotype, params):
    """Build the prepare configuration for one dengue serotype.

    Args:
        serotype: serotype label; used in file names, as the ``lineage``
            and as the key into ``references``.
        params: parsed command-line options. Reads ``file_prefix``,
            ``sequences``, ``years_back``, ``titers`` and ``strains``.
            ``params.titers`` is rewritten in place when the given value is
            not an existing file (it is then looked up under the fauna data
            directory).

    Returns:
        dict: the configuration for this serotype.
    """
    if params.file_prefix is not None:
        file_prefix = params.file_prefix
    else:
        file_prefix = "dengue_%s" % serotype

    fauna_data = '../../../fauna/data/'
    serotype_fasta = "../../../fauna/data/dengue_%s.fasta" % serotype
    all_serotype_fasta = '../../../fauna/data/dengue_all.fasta'

    if params.sequences is not None:
        input_paths = [params.sequences]
    elif os.path.isfile(serotype_fasta):
        # Look for a serotype-specific fasta
        input_paths = [serotype_fasta]
    else:
        # If it doesn't exist, try to pull serotype-specific sequences out of
        # the all-serotype fasta (warn the user of this behavior)
        input_paths = [select_serotype(all_serotype_fasta, fauna_data, serotype)]
        # Report the file that was actually read; the message used to name a
        # different, hard-coded path.
        print('WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'%(serotype, all_serotype_fasta, input_paths))

    years_back = params.years_back
    # Single clock read so both ends of the interval share one "today".
    today = datetime.today()
    time_interval = [today.date(),
                     (today - timedelta(days=365.25 * years_back)).date()]

    if params.titers is not None:
        if not os.path.isfile(params.titers):
            params.titers = '../../../fauna/data/%s' % params.titers
        titer_values, _strains, sources = TiterCollection.load_from_file(params.titers)
    else:
        titer_values, sources = None, None

    # Strains / accessions kept even when their region is missing.
    force_include = sanofi_vaccine_strains.values()

    config = {
        "dir": "dengue",
        "lineage": serotype,
        "title": "Real-time tracking of dengue evolution",
        "maintainer": ["Sidney Bell", "http://bedford.io/team/sidney-bell/"],
        "file_prefix": file_prefix,
        "input_paths": input_paths,
        "header_fields": {
            0: 'strain', 1: 'accession', 2: 'date', 3: 'region',
            4: 'country', 5: 'division', 6: 'location', 7: 'authors',
            8: 'url',
        },
        "filters": (
            ("Dropped Strains",
             lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            ("Bad Region", lambda s: any([
                s.attributes['region'] not in ['', ' ', '?'],
                s.attributes['accession'] in force_include,
                s.attributes['strain'] in force_include,
            ])),
        ),
        "subsample": dengue_subsampling(params, years_back, titer_values, force_include),
        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession",
        },
        "colors": ["authors", "region", "country"],
        "lat_longs": ["region", "country"],
        "auspice_filters": ["authors", "region", "country"],
        "reference": references[serotype],
        "time_interval": time_interval,
        "titers": titer_values,
        "strains": params.strains,
        "sources": sources,
    }
    return config
def make_config(serotype, params):
    """Build the prepare configuration for one dengue serotype.

    Args:
        serotype: serotype label; used in file names and as the key into
            ``references``.
        params: accepted for interface compatibility; not read here.

    Returns:
        dict: the configuration, with ``input_paths`` pointing at either the
        serotype-specific fasta or whatever ``select_serotype`` returns.
    """
    config = {
        "dir": "dengue",
        "file_prefix": "dengue_%s" % serotype,
        "input_paths": None,  # filled in below
        "header_fields": {
            0: 'strain', 1: 'accession', 2: 'date', 3: 'region',
            4: 'country', 5: 'division', 6: 'location', 7: 'authors',
            8: 'url',
        },
        "filters": (
            ("Dropped Strains",
             lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            # ("Sequence Length", lambda s: len(s.seq)>=5000),
            ("Bad Region", lambda s: s.attributes['region'] not in ['', '?']),
        ),
        ### Make subsampling serotype specific?? Probably not?
        "subsample": {
            "category": lambda x: (x.attributes['region'],
                                   x.attributes['date'].year,
                                   x.attributes['date'].month),
            "threshold": 3,
        },
        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession",
        },
        "colors": ["region"],
        "color_defs": ["./colors.tsv"],
        "lat_longs": ["region"],
        "lat_long_defs": '../../fauna/source-data/geo_lat_long.tsv',
        "reference": references[serotype],
    }

    serotype_fasta = "../../fauna/data/dengue_%s.fasta" % serotype
    # NOTE(review): the fallback paths below are rooted at '../fauna' while
    # the serotype-specific file is looked up under '../../fauna' -- confirm
    # which root is intended.
    all_serotype_fasta = '../fauna/data/dengue_all.fasta'

    if os.path.isfile(serotype_fasta):
        # Look for a serotype-specific fasta
        config["input_paths"] = [serotype_fasta]
    else:
        # If it doesn't exist, try to pull serotype-specific sequences out of
        # the all-serotype fasta (warn the user of this behavior)
        # NOTE(review): unlike the branch above, the result is stored without
        # being wrapped in a list -- verify what select_serotype returns.
        config["input_paths"] = select_serotype(
            all_serotype_fasta, '../fauna/data/', serotype)
        # Report the file that was actually read; the message used to name a
        # different, hard-coded path.
        print(
            'WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'
            % (serotype, all_serotype_fasta, config["input_paths"]))
    return config
def make_config(serotype, params):
    """Build the prepare configuration for one dengue serotype.

    Args:
        serotype: serotype label; used in file names, as the ``lineage``
            and as the key into ``references``.
        params: parsed command-line options. Reads ``file_prefix``,
            ``sequences``, ``years_back`` and ``titers``. ``params.titers``
            is rewritten in place when the given value is not an existing
            file (it is then looked up under the fauna data directory).

    Returns:
        dict: the configuration for this serotype.
    """
    if params.file_prefix is not None:
        file_prefix = params.file_prefix
    else:
        file_prefix = "dengue_%s" % serotype

    serotype_fasta = "../../fauna/data/dengue_%s.fasta" % serotype
    # NOTE(review): the fallback is rooted at '../fauna' while every other
    # path here uses '../../fauna' -- confirm which root is intended.
    all_serotype_fasta = '../fauna/data/dengue_all.fasta'

    if params.sequences is not None:
        input_paths = [params.sequences]
    elif os.path.isfile(serotype_fasta):
        # Look for a serotype-specific fasta
        input_paths = [serotype_fasta]
    else:
        # If it doesn't exist, try to pull serotype-specific sequences out of
        # the all-serotype fasta (warn the user of this behavior)
        input_paths = [select_serotype(all_serotype_fasta, '../fauna/data/', serotype)]
        # Report the file that was actually read; the message used to name a
        # different, hard-coded path.
        print('WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'%(serotype, all_serotype_fasta, input_paths))

    years_back = params.years_back
    # Single clock read so both ends of the interval share one "today".
    today = datetime.today()
    time_interval = [today.date(),
                     (today - timedelta(days=365.25 * years_back)).date()]

    if params.titers is not None:
        if not os.path.isfile(params.titers):
            params.titers = '../../fauna/data/%s' % params.titers
        titer_values, _strains, sources = TiterModel.load_from_file(params.titers)
    else:
        titer_values, sources = None, None

    config = {
        "dir": "dengue",
        "lineage": serotype,
        "title": "Genomic Epidemiology of Dengue Virus",
        "maintainer": ["@sidneymbell", "https://twitter.com/sidneymbell"],
        "file_prefix": file_prefix,
        "input_paths": input_paths,
        "header_fields": {
            0: 'strain', 1: 'accession', 2: 'date', 3: 'region',
            4: 'country', 5: 'division', 6: 'location', 7: 'authors',
            8: 'url',
        },
        "filters": (
            ("Dropped Strains",
             lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            # ("Sequence Length", lambda s: len(s.seq)>=5000),
            ("Bad Region",
             lambda s: s.attributes['region'] not in ['', ' ', '?']),
        ),
        "subsample": dengue_subsampling(params, years_back, titer_values),
        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession",
        },
        "colors": ["region"],
        "color_defs": "./colors.tsv",
        "lat_longs": ["region"],
        "lat_long_defs": '../../fauna/source-data/geo_lat_long.tsv',
        "reference": references[serotype],
        "time_interval": time_interval,
        "titers": titer_values,
        "sources": sources,
    }
    return config
def make_config(params):
    """Build the prepare configuration for the measles build.

    Args:
        params: parsed command-line options. Only ``viruses_per_month`` is
            read; it becomes the subsampling threshold.

    Returns:
        dict: input paths, FASTA header layout, sequence filters,
        subsampling rules, colour / lat-long traits and reference metadata.
    """
    dropped_strains = [
        "temara.MOR/24.03", "Mvs/Toulon.FRA/08.07"  # clock is off
    ]

    # Written as ``1`` rather than ``01``: a leading-zero integer literal is
    # an octal in Python 2 and a SyntaxError in Python 3.
    earliest = datetime(1950, 1, 1).date()
    latest = datetime(2020, 1, 1).date()

    filters = (
        ("Dropped Strains",
         lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
        ("Restrict Date Range", lambda s: s.attributes['date'] >= earliest),
        ("Restrict Date Range", lambda s: s.attributes['date'] <= latest),
        ("Sequence Length", lambda s: len(s.seq) >= 5000),
        ("Number Ns", lambda s: s.seq.count('N') <= 3000),
    )

    config = {
        "dir": "measles",
        "file_prefix": "measles",
        "title": "Real-time tracking of measles virus evolution",
        "maintainer": ["Trevor Bedford", "http://bedford.io/team/trevor-bedford/"],
        "input_paths": ["../../../fauna/data/measles.fasta"],
        "header_fields": {
            0: 'strain', 2: 'accession', 3: 'date', 4: 'region',
            5: 'country', 6: 'division', 8: 'db', 10: 'authors', 11: 'url',
            12: 'title', 13: 'journal', 14: 'paper_url',
        },
        "filters": filters,
        "subsample": {
            "threshold": params.viruses_per_month,
            # One bucket per (year, month, country).
            "category": lambda x: (x.attributes['date'].year,
                                   x.attributes['date'].month,
                                   x.attributes['country']),
        },
        "colors": ["authors", "country", "region"],
        "color_defs": ["./colors.tsv"],
        "lat_longs": ["country", "region"],
        "auspice_filters": ["authors", "region", "country"],
        "reference": {
            "path": "measles-reference.gb",
            "metadata": {
                'strain': "Ichinose-B95a",
                "accession": "NC_001498.1",
                "date": "XXXX-XX-XX",
                'host': "human",
                'country': "Unknown",
                'region': "Unknown",
            },
            "include": 0,
            "genes": ['N', 'P', 'V', 'C', 'M', 'F', 'H', 'L'],
        },
    }
    return config
"input_paths": ["./data/mahar_RHDV.edit.fasta"], #>AUS/ACT/BLMT-3/2015|blmt-3|RHDV|MF421563.1|RHDV1_G2|2015-06-18|Australia|ACT|Mahar et al|Monitoring the init "header_fields": { 0: 'strain', 1: 'isolate', 4: 'rhdv_strain', 5: 'date', 6: 'country', 7: 'state', 8: 'authors', 10: 'journal', 9: 'title' }, "filters": (("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]), # ("Restrict Date Range", lambda s: s.attributes['date'] >= datetime(2012,01,1).date()), # ("Restrict Date Range", lambda s: s.attributes['date'] <= datetime(2018,01,1).date()), #("Sequence Length", lambda s: len(s.seq)>=10000), ), "subsample": { "category": lambda x: (x.attributes['date'].year, x.attributes['date'].month), }, "colors": ["state", "authors", "rhdv_strain"], # "color_defs": ["./colors.tsv"], "lat_longs": ["isolate"], "lat_long_defs": "./RHDV2_coords.edit.txt", "auspice_filters": ["state", "authors", "rhdv_strain"], "reference": {
def make_config(serotype, params):
    """Assemble the dengue prepare configuration for a single serotype.

    Args:
        serotype: serotype label; used in file names, as the ``lineage``
            and as the key into ``references``.
        params: parsed command-line options. Reads ``file_prefix``,
            ``sequences``, ``years_back``, ``titers`` and ``strains``.
            ``params.titers`` is rewritten in place when the given value is
            not an existing file (it is then looked up under the fauna data
            directory).

    Returns:
        dict: the configuration for this serotype.
    """
    if params.file_prefix is not None:
        file_prefix = params.file_prefix
    else:
        file_prefix = "dengue_%s" % serotype

    data_dir = '../../../fauna/data/'
    per_serotype_path = "../../../fauna/data/dengue_%s.fasta" % serotype
    combined_path = '../../../fauna/data/dengue_all.fasta'

    if params.sequences is not None:
        input_paths = [params.sequences]
    elif os.path.isfile(per_serotype_path):
        # Look for a serotype-specific fasta
        input_paths = [per_serotype_path]
    else:
        # If it doesn't exist, try to pull serotype-specific sequences out of
        # the all-serotype fasta (warn the user of this behavior)
        input_paths = [select_serotype(combined_path, data_dir, serotype)]
        # The source file in the message is the one actually passed to
        # select_serotype; it used to be a different, hard-coded path.
        print(
            'WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'
            % (serotype, combined_path, input_paths))

    years_back = params.years_back
    # Read the clock once so both interval ends derive from the same instant.
    now = datetime.today()
    time_interval = [
        now.date(),
        (now - timedelta(days=365.25 * years_back)).date(),
    ]

    if params.titers is not None:
        if not os.path.isfile(params.titers):
            params.titers = '../../../fauna/data/%s' % params.titers
        titer_values, _strains, sources = TiterCollection.load_from_file(
            params.titers)
    else:
        titer_values, sources = None, None

    # Strains / accessions kept even when their region is missing.
    force_include = sanofi_vaccine_strains.values()

    config = {
        "dir": "dengue",
        "lineage": serotype,
        "title": "Real-time tracking of dengue evolution",
        "maintainer": ["Sidney Bell", "http://bedford.io/team/sidney-bell/"],
        "file_prefix": file_prefix,
        "input_paths": input_paths,
        "header_fields": {
            0: 'strain',
            1: 'accession',
            2: 'date',
            3: 'region',
            4: 'country',
            5: 'division',
            6: 'location',
            7: 'authors',
            8: 'url',
        },
        "filters": (
            ("Dropped Strains",
             lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            ("Bad Region", lambda s: any([
                s.attributes['region'] not in ['', ' ', '?'],
                s.attributes['accession'] in force_include,
                s.attributes['strain'] in force_include,
            ])),
        ),
        "subsample": dengue_subsampling(params, years_back, titer_values,
                                        force_include),
        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession",
        },
        "colors": ["authors", "region", "country"],
        "lat_longs": ["region", "country"],
        "auspice_filters": ["authors", "region", "country"],
        "reference": references[serotype],
        "time_interval": time_interval,
        "titers": titer_values,
        "strains": params.strains,
        "sources": sources,
    }
    return config
def make_config(lineage, resolution, params):
    """Build the prepare configuration for a flu lineage at a resolution.

    Args:
        lineage: lineage label; used in file names and as the key into
            ``reference_viruses``, ``outliers`` and ``reference_maps``.
        resolution: string containing the number of years to look back;
            the first run of digits is used.
        params: parsed command-line options. Reads ``segments`` and is
            passed on to ``flu_subsampling``.

    Returns:
        dict: the configuration for this lineage / resolution.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    years_back = int(re.search(r"(\d+)", resolution).groups()[0])

    # [newest, oldest]; one clock read so both ends share the same "today".
    today = datetime.today()
    time_interval = [
        today.date(),
        (today - timedelta(days=365.25 * years_back)).date(),
    ]
    reference_cutoff = date(year=time_interval[0].year - 3, month=1, day=1)

    # Normalise the outlier names once instead of once per sequence.
    fixed_outliers = [fix_names(x) for x in outliers[lineage]]

    return {
        "dir": "flu",
        "file_prefix": "flu_{}".format(lineage),
        "segments": params.segments,
        "resolution": resolution,
        "lineage": lineage,
        "input_paths": [
            "../../fauna/data/{}_{}.fasta".format(lineage, segment)
            for segment in params.segments
        ],
        #  0                     1   2         3          4      5     6       7       8          9                             10  11
        # >A/Galicia/RR9542/2012|flu|EPI376225|2012-02-23|europe|spain|galicia|galicia|unpassaged|instituto_de_salud_carlos_iii|47y|female
        "header_fields": {
            0: 'strain',
            2: 'isolate_id',
            3: 'date',
            4: 'region',
            5: 'country',
            6: 'division',
            8: 'passage',
            9: 'lab',
            10: 'age',
            11: 'gender',
        },
        "filters": (
            # Keep sequences inside the interval, plus reference viruses
            # newer than the reference cutoff.
            ("Time Interval",
             lambda s: (s.attributes['date'] <= time_interval[0]
                        and s.attributes['date'] >= time_interval[1])
             or (s.name in reference_viruses[lineage]
                 and s.attributes['date'] > reference_cutoff)),
            ("Sequence Length", lambda s: len(s.seq) >= 900),
            # what's the order of evaluation here I wonder?
            ("Dropped Strains", lambda s: s.id not in fixed_outliers),
            ("Bad geo info",
             lambda s: s.attributes["country"] != "?"
             and s.attributes["region"] != "?"),
        ),
        "subsample": flu_subsampling(
            params, years_back,
            "../../fauna/data/{}_crick_hi".format(lineage)),
        "colors": ["region"],
        "color_defs": ["colors.flu.tsv"],
        "lat_longs": ["country", "region"],
        "lat_long_defs": '../../fauna/source-data/geo_lat_long.tsv',
        "references": {seg: reference_maps[lineage][seg]
                       for seg in params.segments},
        "regions": regions,
        "time_interval": time_interval,
    }
def make_config(lineage, resolution, params):
    """Build the prepare configuration for a seasonal flu build.

    Reads ``metadata/<lineage>_outliers.txt`` (one strain name per line) to
    obtain the strains to drop.

    Args:
        lineage: lineage label; used in file names and as the key into
            ``reference_viruses``, ``reference_maps`` and
            ``vaccine_choices``.
        resolution: string containing the number of years to look back
            (first run of digits); also the key into ``LBI_params`` and
            ``frequency_params``.
        params: parsed command-line options. Reads ``time_interval``,
            ``titers``, ``sequences``, ``segments``, ``file_prefix``,
            ``ensure_all_segments`` and ``strains``.

    Returns:
        dict: the configuration. ``vaccine_choices``, ``LBI_params`` and
        ``frequency_params`` are only present when defined for the given
        lineage / resolution; a warning is printed otherwise.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    years_back = int(re.search(r"(\d+)", resolution).groups()[0])

    # time_interval is ordered [newest, oldest].
    if params.time_interval:
        time_interval = sorted(
            [datetime.strptime(x, '%Y-%m-%d').date() for x in params.time_interval],
            reverse=True)
    else:
        today = datetime.today()
        time_interval = [today.date(),
                         (today - timedelta(days=365.25 * years_back)).date()]
    reference_cutoff = date(year=time_interval[1].year - 4, month=1, day=1)

    # Load and prepare outliers for the given lineage.
    with open("metadata/%s_outliers.txt" % lineage, "r") as fh:
        outliers = [outlier.rstrip() for outlier in fh]

    fixed_outliers = [fix_names(x) for x in outliers]
    fixed_references = [fix_names(x) for x in reference_viruses[lineage]]

    if params.titers is not None:
        titer_values, _strains, _sources = TiterCollection.load_from_file(params.titers)
    else:
        titer_values = None

    if params.sequences is not None:
        input_paths = params.sequences
    else:
        input_paths = ["../../../fauna/data/{}_{}.fasta".format(lineage, segment)
                       for segment in params.segments]

    if params.file_prefix:
        file_prefix = params.file_prefix
    else:
        # e.g. flu_seasonal_h3n2_ha_6y
        file_prefix = "flu_seasonal_{}_{}_{}".format(
            lineage, params.segments[0], resolution)

    config = {
        "dir": "flu",
        "file_prefix": file_prefix,
        "title": make_title(lineage, resolution),
        "maintainer": ["Trevor Bedford and Barney Potter", "http://bedford.io/"],
        "auspice_filters": ["clade_membership", "region", "country"],
        "segments": params.segments,
        "ensure_all_segments": params.ensure_all_segments,
        "lineage": lineage,
        "resolution": resolution,
        "input_paths": input_paths,
        #  0                     1   2         3          4      5     6       7       8          9                             10  11
        # >A/Galicia/RR9542/2012|flu|EPI376225|2012-02-23|europe|spain|galicia|galicia|unpassaged|instituto_de_salud_carlos_iii|47y|female
        "header_fields": {
            0: 'strain', 2: 'isolate_id', 3: 'date', 4: 'region',
            5: 'country', 6: 'division', 8: 'passage', 9: 'authors',
            10: 'age', 11: 'gender',
        },
        "filters": (
            # Keep sequences inside the interval, plus reference viruses
            # newer than the reference cutoff.
            ("Time Interval",
             lambda s: (s.attributes['date'] <= time_interval[0]
                        and s.attributes['date'] >= time_interval[1])
             or (s.name in fixed_references
                 and s.attributes['date'] > reference_cutoff)),
            ("invalid chars",
             lambda s: sum([s.seq.count(c) for c in "EFIJKLOPQXYZ"]) == 0),
            ("Sequence Length", lambda s: len(s.seq) >= 900),
            # what's the order of evaluation here I wonder?
            ("Dropped Strains", lambda s: s.id not in fixed_outliers),
            ("Bad geo info",
             lambda s: s.attributes["country"] != "?"
             and s.attributes["region"] != "?"),
        ),
        "subsample": flu_subsampling(params, years_back, titer_values),
        "colors": ["region", "country"],
        "color_defs": ["colors.tsv"],
        "lat_longs": ["country", "region"],
        "references": {seg: reference_maps[lineage][seg]
                       for seg in params.segments},
        "regions": regions,
        "time_interval": time_interval,
        "strains": params.strains,
        "titers": titer_values,
    }

    ## VACCINES
    if lineage in vaccine_choices:
        config["vaccine_choices"] = vaccine_choices[lineage]
    else:
        print("WARNING. vaccine_choices are undefined for this lineage")

    ## LBI
    # Only a missing key means "undefined for this resolution"; anything
    # else is a real error and should not be hidden behind the warning.
    try:
        config["LBI_params"] = LBI_params[resolution]
    except KeyError:
        print("WARNING. LBI parameters are undefined for this resolution")

    ## FREQUENCIES
    try:
        config["frequency_params"] = frequency_params[resolution]
    except KeyError:
        print("WARNING. Frequency parameters are undefined for this resolution")

    return config
def make_config(params):
    """Build the prepare configuration for the mumps build.

    Args:
        params: parsed command-line options. Reads ``geo`` (``"global"`` or
            ``"na"``) and ``viruses_per_month`` (``0`` selects the default
            for the chosen ``geo``: 3 for global, 100 for na).

    Returns:
        dict: input paths, FASTA header layout, sequence filters,
        subsampling rules, colour / lat-long traits and reference metadata.

    Raises:
        ValueError: if ``params.geo`` is neither ``"global"`` nor ``"na"``.
    """
    if params.geo == "global":
        file_prefix = "mumps_global"
        default_viruses_per_month = 3
        dropped_strains = [
            "WA0268502_buccal/Washington.USA/16",  # not yet released
            "Split.CRO/05.11/G",  # retracted sequence
            "9218/Zg98",  # retracted sequence
            "Zagreb.HRV/28.12/G",  # retracted sequence
            "Du/CRO05"  # retracted sequence
        ]
        colors = ["authors", "region", "country", "MuV_genotype"]
        lat_longs = ["country", "region"]
        auspice_filters = ["authors", "region", "country", "MuV_genotype"]
        # ``1`` rather than ``01``: leading-zero integer literals are a
        # SyntaxError in Python 3.
        earliest = datetime(1950, 1, 1).date()
        extra_filters = ()
    elif params.geo == "na":
        file_prefix = "mumps_na"
        default_viruses_per_month = 100
        dropped_strains = [
            "Ontario.CAN/13.10/G", "Ontario.CAN/04.10/G", "Massachusetts.USA/37.16/1/G",
            "BritishColumbia.CAN/50.16/H", "BritishColumbia.CAN/22.16/1/G", "Mass.USA/4.10",
            "Virginia.USA/10.12/H", "BritishColumbia.CAN/33.16/3/G", "BritishColumbia.CAN/33.16/1/G",
            "BritishColumbia.CAN/9.17/A", "BritishColumbia.CAN/28.16/3/G",
            # all below are true strains, but group outside NA outbreak clade
            "WA0268502_buccal/Washington.USA/16",  # not yet released
            "Washington.USA/2017217/8.17/3/G",  # outlier. MRCA with other NA strains of 1990
            "BritishColumbia.CAN/34.16/2/F",  # MuV genotype F. MRCA of 1943 (!)
            "Massachusetts.USA/24.17/5/K",  # MuV genotype K
            "Massachusetts.USA/11.17/G", "Massachusetts.USA/7.17/G", "Massachusetts.USA/9.17/G",
            "Massachusetts.USA/10.17/G", "Massachusetts.USA/10.17/2/G", "Massachusetts.USA/7.17/2/G",
            "Massachusetts.USA/13.17/G", "Massachusetts.USA/12.17/G", "Georgia.USA/2.17/G",
            "Massachusetts.USA/5.17/G", "Massachusetts.USA/18.17/G", "Massachusetts.USA/22.17/7/G",
            "Massachusetts.USA/23.17/2/G", "Massachusetts.USA/19.17/2/G"
        ]
        colors = ["authors", "country", "division", "MuV_genotype"]
        lat_longs = ["country", "division"]
        auspice_filters = ["authors", "country", "division", "MuV_genotype"]
        earliest = datetime(2009, 1, 1).date()
        extra_filters = (
            ("Restrict Region",
             lambda s: s.attributes['region'] == 'north_america'),
        )
    else:
        # Fail here with a clear message instead of an UnboundLocalError
        # further down.
        raise ValueError(
            "Unknown geo %r: expected 'global' or 'na'" % (params.geo,))

    # 0 means "not set": fall back to the default for this geo.
    if params.viruses_per_month == 0:
        viruses_per_month = default_viruses_per_month
    else:
        viruses_per_month = params.viruses_per_month

    latest = datetime(2020, 1, 1).date()
    # Filters shared by both builds; the na build appends its region filter.
    filters = (
        ("Dropped Strains",
         lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
        ("Restrict Date Range", lambda s: s.attributes['date'] >= earliest),
        ("Restrict Date Range", lambda s: s.attributes['date'] <= latest),
        ("Sequence Length", lambda s: len(s.seq) >= 5000),
        ("Number Ns", lambda s: s.seq.count('N') <= 3000),
    ) + extra_filters

    config = {
        "dir": "mumps",
        "file_prefix": file_prefix,
        "title": "Real-time tracking of mumps virus evolution",
        "maintainer": ["Louise Moncla", "http://bedford.io/team/louise-moncla/"],
        "input_paths": ["../../../fauna/data/mumps.fasta"],
        "header_fields": {
            0: 'strain', 2: 'accession', 3: 'date', 4: 'region',
            5: 'country', 6: 'division', 8: 'db', 10: 'authors', 11: 'url',
            12: 'title', 13: 'journal', 14: 'paper_url', 15: 'MuV_genotype',
        },
        "filters": filters,
        "subsample": {
            "threshold": viruses_per_month,
            # One bucket per (year, month, country, genotype).
            "category": lambda x: (x.attributes['date'].year,
                                   x.attributes['date'].month,
                                   x.attributes['country'],
                                   x.attributes['MuV_genotype']),
        },
        "colors": colors,
        "color_defs": ["./colors.tsv"],
        "lat_longs": lat_longs,
        "auspice_filters": auspice_filters,
        "reference": {
            "path": "mumps-reference.gb",
            "metadata": {
                'strain': "MuV/Gabon/13/2",
                "accession": "KM597072.1",
                "date": "2013-03-01",
                'host': "human",
                'country': "Gabon",
                'region': "Gabon",
                'MuV_genotype': "G",
            },
            "include": 0,
            "genes": ['NC', 'P', 'V', 'I', 'M', 'F', 'SH', 'HN', 'L'],
        },
    }
    return config
def make_config(params):
    """Build the prepare configuration for the two-segment Lassa build.

    Args:
        params: parsed command-line options. Only ``viruses_per_month`` is
            read; it becomes the subsampling threshold.

    Returns:
        dict: input paths for the S and L segments, FASTA header layout,
        filters (some given per segment as ``{"s": ..., "l": ...}``),
        subsampling rules, colour / lat-long traits and per-segment
        reference metadata.
    """
    header_fields = {
        0: 'strain',
        # Spelled 'accession' (was 'accesion') so it matches the key used in
        # the reference metadata below.
        1: 'accession',
        2: 'segment',
        3: 'date',
        4: 'region',
        5: 'country',
        6: 'host_species',
        7: 'authors',
        8: 'title',
        9: 'journal',
        10: 'paper_url',
    }

    filters = (
        ("Dropped Strains",
         lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
        # Currently accepts every sequence on both segments.
        ("Restrict Date Range for S segment", {
            "s": lambda s: True,
            "l": lambda s: True,
        }),
        # ("Restrict Date Range", lambda s: s.attributes['date'] <= datetime(2018,1,1).date()),
        ("Sequence Length", {
            "s": lambda s: len(s.seq) >= 2500,
            "l": lambda s: len(s.seq) >= 5000,
        }),
    )

    references = {
        # references are pinneo strain. Same as Kristian's Cell paper.
        # Pinneo paper: http://jvi.asm.org/content/74/15/6992.long
        # Cell paper: http://www.cell.com/cell/pdfExtended/S0092-8674(15)00897-1
        "s": {
            "path": "metadata/lassa_s.gb",
            "metadata": {
                'strain': "Nig08_04",
                "accession": "GU481068",
                "date": "2008-XX-XX",
                'country': "nigeria",
                'segment': 'S',
            },
            "include": 1,
            "genes": ['NP', 'GPC'],
        },
        "l": {
            "path": "metadata/lassa_l.gb",
            "metadata": {
                'strain': "Pinneo-NIG-1969",
                "accession": "KM822127",
                "date": "1969-XX-XX",
                'country': "nigeria",
                'segment': 'L',
            },
            "include": 1,
            "genes": ['Z', 'L'],
        },
    }

    return {
        "dir": "lassa",
        "file_prefix": "lassa",
        "title": "Real-time tracking of Lassa virus evolution",
        "maintainer": ["Bedford Lab", "http://bedford.io/team/"],
        "input_paths": [
            "../../../flora/data/lassa_s.fasta",
            "../../../flora/data/lassa_l.fasta",
        ],
        "header_fields": header_fields,
        "filters": filters,
        "subsample": {
            # One bucket per (country, year).
            "category": lambda x: (x.attributes['country'],
                                   x.attributes['date'].year),
            "threshold": params.viruses_per_month,
            "priority": lambda x: x.id in forced_strains,
        },
        "colors": ["country", "host_species"],
        "color_defs": ["./colors.tsv"],
        "lat_longs": ["country"],
        "auspice_filters": ["country", "authors", "host_species"],
        "references": references,
    }
from __future__ import print_function import os, sys sys.path.append('..') # we assume (and assert) that this script is running from the virus directory, i.e. inside H7N9 or zika from base.prepare import prepare from datetime import datetime from base.utils import fix_names import argparse def collect_args(): parser = argparse.ArgumentParser(description = "Prepare fauna FASTA for analysis") return parser.parse_args() dropped_strains = [] filters = { "dropped_strains": ("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]), "canada_only": ("Canada only", lambda s: s.attributes['country'] == "canada"), "exclude_BC": ("Exclude BC outbreak", lambda s: not s.attributes['accession'].startswith("BC_outbreak")), "Mass_only": ("Massachusetts only", lambda s: s.attributes['accession'].startswith("Massachusetts_outbreak")), "exclude_Mass": ("Exclude Massachusetts outbreak", lambda s: not s.attributes['accession'].startswith("Massachusetts_outbreak")), "unknown_country": ("Exclude unknown countries", lambda s: not s.attributes['country'].startswith("unknown")) } def make_config(context): config = { "dir": "mumps", "file_prefix": "mumps_%s"%context, "title": "Mumps virus (context: {}).format(context)", "maintainer": ["@LouiseHMoncla", "https://twitter.com/louisehmoncla"], "input_paths": ["../../fauna/data/mumps.fasta"], "header_fields": {0:'strain', 2:'accession', 3:'date', 4:'region', 5:'country',
def make_config(lineage, resolution, params):
    """Build the prepare configuration for a flu lineage at a resolution.

    Args:
        lineage: lineage label; used in file names and as the key into
            ``outliers``, ``reference_viruses`` and ``reference_maps``.
        resolution: string containing the number of years to look back;
            the first run of digits is used.
        params: parsed command-line options. Reads ``time_interval``,
            ``titers``, ``sequences``, ``segment``, ``file_prefix`` and
            ``strains``.

    Returns:
        dict: the configuration for this lineage / resolution.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    years_back = int(re.search(r"(\d+)", resolution).groups()[0])

    # time_interval is ordered [newest, oldest].
    if params.time_interval:
        time_interval = sorted(
            [datetime.strptime(x, '%Y-%m-%d').date()
             for x in params.time_interval],
            reverse=True)
    else:
        # One clock read so both ends share the same "today".
        today = datetime.today()
        time_interval = [
            today.date(),
            (today - timedelta(days=365.25 * years_back)).date(),
        ]
    reference_cutoff = date(year=time_interval[1].year - 3, month=1, day=1)

    fixed_outliers = [fix_names(x) for x in outliers[lineage]]
    fixed_references = [fix_names(x) for x in reference_viruses[lineage]]

    if params.titers is not None:
        titer_values, _strains, _sources = TiterModel.load_from_file(
            params.titers)
    else:
        titer_values = None

    if params.sequences is not None:
        input_paths = params.sequences
    else:
        input_paths = [
            "../../fauna/data/{}_{}.fasta".format(lineage, segment)
            for segment in params.segment
        ]

    if params.file_prefix:
        file_prefix = params.file_prefix
    else:
        file_prefix = "flu_{}_{}_{}".format(lineage, params.segment[0],
                                            resolution)

    return {
        "dir": "flu",
        "file_prefix": file_prefix,
        "title": make_title(lineage, resolution),
        "maintainer": ["@trvrb", "https://twitter.com/trvrb"],
        "segments": params.segment,
        "lineage": lineage,
        "input_paths": input_paths,
        #  0                     1   2         3          4      5     6       7       8          9                             10  11
        # >A/Galicia/RR9542/2012|flu|EPI376225|2012-02-23|europe|spain|galicia|galicia|unpassaged|instituto_de_salud_carlos_iii|47y|female
        "header_fields": {
            0: 'strain',
            2: 'isolate_id',
            3: 'date',
            4: 'region',
            5: 'country',
            6: 'division',
            8: 'passage',
            9: 'lab',
            10: 'age',
            11: 'gender',
        },
        "filters": (
            # Keep sequences inside the interval, plus reference viruses
            # newer than the reference cutoff.
            ("Time Interval",
             lambda s: (s.attributes['date'] <= time_interval[0]
                        and s.attributes['date'] >= time_interval[1])
             or (s.name in fixed_references
                 and s.attributes['date'] > reference_cutoff)),
            ("Sequence Length", lambda s: len(s.seq) >= 900),
            # what's the order of evaluation here I wonder?
            ("Dropped Strains", lambda s: s.id not in fixed_outliers),
            ("Bad geo info",
             lambda s: s.attributes["country"] != "?"
             and s.attributes["region"] != "?"),
        ),
        "subsample": flu_subsampling(params, years_back, titer_values),
        "colors": ["region"],
        "color_defs": ["colors.flu.tsv"],
        "lat_longs": ["country", "region"],
        "lat_long_defs": '../../fauna/source-data/geo_lat_long.tsv',
        "references": {seg: reference_maps[lineage][seg]
                       for seg in params.segment},
        "regions": regions,
        "time_interval": time_interval,
        "strains": params.strains,
        "titers": titer_values,
    }