Beispiel #1
0
def make_config(params):
    """Build the prepare configuration for the ebola build.

    Args:
        params: options object; only ``viruses_per_month`` is read and it is
            used as the subsampling threshold.

    Returns:
        dict: input path, fasta header layout, filters, subsampling rules,
        colour / lat-long traits and the reference description.
    """
    # Integer literals with a leading zero (``01``) are a syntax error on
    # Python 3, so the date bounds are written with plain decimals.
    earliest = datetime(2012, 1, 1).date()
    latest = datetime(2018, 1, 1).date()

    return {
        "dir": "ebola",
        "file_prefix": "ebola",
        "input_paths": ["../../fauna/data/ebola.fasta"],
        # Maps the position of each fasta header field to an attribute name.
        "header_fields": {0:'strain', 2:'accession', 3:'date', 4:'region', 5:'country', 6:'division', 8:'db', 10:'authors', 11:'url'},
        # Each filter is (label, predicate); the predicate receives a sequence
        # object and returns True when the sequence should be kept.
        "filters": (
            ("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            ("Restrict Date Range", lambda s: s.attributes['date'] >= earliest),
            ("Restrict Date Range", lambda s: s.attributes['date'] <= latest),
        ),
        "subsample": {
            # Sequences are grouped by (region, year, month).
            "category": lambda x:(x.attributes['region'], x.attributes['date'].year, x.attributes['date'].month),
            "threshold": params.viruses_per_month,
            "priority": lambda x:x.id in forced_strains
        },
        "colors": ["country", "division"], # essential. Maybe False.
        "color_defs": ["./colors.tsv"],
        "lat_longs": ["country", "division"], # essential. Maybe False.
        "reference": {
            "path": "metadata/ebola_outgroup.gb",
            "metadata": {
                'strain': "reference", "accession": "KR075003", "date": "2014-XX-XX",
                'host': "human", 'country': "Liberia"
            },
            "include": 0,
            "genes": ['NP', 'VP35', 'VP40', 'GP', 'sGP', 'VP30', 'VP24', 'L']
        }
    }
def make_config(serotype, params):
    """Assemble the prepare configuration for a single dengue serotype.

    Args:
        serotype: serotype label; interpolated into the default file prefix
            and the fasta file names, and used as the key into
            ``references``.
        params: options object.  Reads ``file_prefix``, ``sequences``,
            ``years_back``, ``titers`` and ``strains``.  When ``titers`` is
            set but does not name an existing file, ``params.titers`` is
            rewritten in place to point into the fauna data directory.

    Returns:
        dict: the configuration for this serotype.
    """
    fauna_data = "../../../fauna/data/"

    if params.file_prefix is not None:
        file_prefix = params.file_prefix
    else:
        file_prefix = "dengue_%s" % serotype

    serotype_fasta = "%sdengue_%s.fasta" % (fauna_data, serotype)
    if params.sequences is not None:
        input_paths = [params.sequences]
    elif os.path.isfile(serotype_fasta):  # Look for a serotype-specific fasta
        input_paths = [serotype_fasta]
    else:
        # No serotype-specific fasta: pull the serotype out of the
        # all-serotype fasta and warn the user about this behavior.  The
        # warning reports the file that is actually read.
        all_serotype_fasta = fauna_data + "dengue_all.fasta"
        input_paths = [select_serotype(all_serotype_fasta, fauna_data, serotype)]
        print('WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'%(serotype, all_serotype_fasta, input_paths))

    years_back = params.years_back
    # Sample the clock once so both ends of the interval share one "today".
    today = datetime.today()
    time_interval = [today.date(), (today - timedelta(days=365.25 * years_back)).date()]

    if params.titers is not None:
        if not os.path.isfile(params.titers):
            params.titers = '../../../fauna/data/%s'%params.titers
        # The second element of the returned triple is not used here.
        titer_values, _, sources = TiterCollection.load_from_file(params.titers)
    else:
        titer_values, sources = None, None

    force_include = sanofi_vaccine_strains.values()

    config = {
        "dir": "dengue",
        "lineage": serotype,
        "title": "Real-time tracking of dengue evolution",
        "maintainer": ["Sidney Bell", "http://bedford.io/team/sidney-bell/"],
        "file_prefix": file_prefix,
        "input_paths": input_paths,
        "header_fields": {0:'strain', 1:'accession', 2:'date', 3:'region', 4:'country',
                        5:'division', 6: 'location', 7: 'authors', 8: 'url'},
        "filters": (
            ("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            # Keep a sequence when its region is usable, or when its
            # accession or strain is in the forced-include set.
            ("Bad Region", lambda s: any([
                s.attributes['region'] not in ['', ' ', '?'],
                s.attributes['accession'] in force_include,
                s.attributes['strain'] in force_include
            ])),
        ),

        "subsample": dengue_subsampling(params, years_back, titer_values,
                                        force_include),

        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession" },

        "colors": ["authors", "region", "country"],
        "lat_longs": ["region", "country"],
        "auspice_filters": ["authors", "region", "country"],
        "reference": references[serotype],
        "time_interval": time_interval,
        "titers": titer_values,
        "strains": params.strains,
        "sources": sources
    }
    return config
Beispiel #3
0
def make_config(serotype, params):
    """Build the prepare configuration for a single dengue serotype.

    Args:
        serotype: serotype label; interpolated into the file prefix and the
            fasta file name, and used as the key into ``references``.
        params: accepted for interface compatibility; not read by this
            function.

    Returns:
        dict: the configuration.  ``input_paths`` is always a list.
    """
    config = {
        "dir": "dengue",
        "file_prefix": "dengue_%s" % serotype,
        "input_paths": None,  # filled in below once the fasta is located
        "header_fields": {
            0: 'strain',
            1: 'accession',
            2: 'date',
            3: 'region',
            4: 'country',
            5: 'division',
            6: 'location',
            7: 'authors',
            8: 'url'
        },
        "filters": (
            ("Dropped Strains",
             lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            # ("Sequence Length", lambda s: len(s.seq)>=5000),
            ("Bad Region", lambda s: s.attributes['region'] not in ['', '?']),
        ),

        ### Make subsampling serotype specific?? Probably not?
        "subsample": {
            # Sequences are grouped by (region, year, month).
            "category": lambda x: (x.attributes['region'],
                                   x.attributes['date'].year,
                                   x.attributes['date'].month),
            "threshold": 3
        },
        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession"
        },
        "colors": ["region"],
        "color_defs": ["./colors.tsv"],
        "lat_longs": ["region"],
        "lat_long_defs": '../../fauna/source-data/geo_lat_long.tsv',
        "reference": references[serotype]
    }

    serotype_fasta = "../../fauna/data/dengue_%s.fasta" % serotype
    if os.path.isfile(serotype_fasta):  # Look for a serotype-specific fasta
        config["input_paths"] = [serotype_fasta]
    else:
        # No serotype-specific fasta: pull the serotype out of the
        # all-serotype fasta and warn the user about this behavior.
        # NOTE(review): the fallback uses '../fauna/data/' while the check
        # above uses '../../fauna/data/' -- confirm which prefix is intended.
        all_serotype_fasta = '../fauna/data/dengue_all.fasta'
        # Wrapped in a list so "input_paths" has the same type in both
        # branches.
        config["input_paths"] = [
            select_serotype(all_serotype_fasta, '../fauna/data/', serotype)
        ]
        # The warning reports the file that is actually read.
        print(
            'WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'
            % (serotype, all_serotype_fasta, config["input_paths"]))

    return config
Beispiel #4
0
def make_config(serotype, params):
    """Assemble the prepare configuration for a single dengue serotype.

    Args:
        serotype: serotype label; interpolated into the default file prefix
            and the fasta file names, and used as the key into
            ``references``.
        params: options object.  Reads ``file_prefix``, ``sequences``,
            ``years_back`` and ``titers``.  When ``titers`` is set but does
            not name an existing file, ``params.titers`` is rewritten in
            place to point into the fauna data directory.

    Returns:
        dict: the configuration for this serotype.
    """
    if params.file_prefix is not None:
        file_prefix = params.file_prefix
    else:
        file_prefix = "dengue_%s" % serotype

    serotype_fasta = "../../fauna/data/dengue_%s.fasta" % serotype
    if params.sequences is not None:
        input_paths = [params.sequences]
    elif os.path.isfile(serotype_fasta):  # Look for a serotype-specific fasta
        input_paths = [serotype_fasta]
    else:
        # No serotype-specific fasta: pull the serotype out of the
        # all-serotype fasta and warn the user about this behavior.
        # NOTE(review): the fallback uses '../fauna/data/' while the check
        # above uses '../../fauna/data/' -- confirm which prefix is intended.
        all_serotype_fasta = '../fauna/data/dengue_all.fasta'
        input_paths = [select_serotype(all_serotype_fasta, '../fauna/data/', serotype)]
        # The warning reports the file that is actually read.
        print('WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'%(serotype, all_serotype_fasta, input_paths))

    years_back = params.years_back
    # Sample the clock once so both ends of the interval share one "today".
    today = datetime.today()
    time_interval = [today.date(), (today - timedelta(days=365.25 * years_back)).date()]

    if params.titers is not None:
        if not os.path.isfile(params.titers):
            params.titers = '../../fauna/data/%s'%params.titers
        # The second element of the returned triple is not used here.
        titer_values, _, sources = TiterModel.load_from_file(params.titers)
    else:
        titer_values, sources = None, None

    config = {
        "dir": "dengue",
        "lineage": serotype,
        "title": "Genomic Epidemiology of Dengue Virus",
        "maintainer": ["@sidneymbell", "https://twitter.com/sidneymbell"],
        "file_prefix": file_prefix,
        "input_paths": input_paths,
        "header_fields": {0:'strain', 1:'accession', 2:'date', 3:'region', 4:'country',
                        5:'division', 6: 'location', 7: 'authors', 8: 'url'},
        "filters": (
            ("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            # ("Sequence Length", lambda s: len(s.seq)>=5000),
            ("Bad Region", lambda s: s.attributes['region'] not in ['', ' ', '?']),
        ),

        "subsample": dengue_subsampling(params, years_back, titer_values),

        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession" },

        "colors": ["region"],
        "color_defs": "./colors.tsv",
        "lat_longs": ["region"],
        "lat_long_defs": '../../fauna/source-data/geo_lat_long.tsv',
        "reference": references[serotype],
        "time_interval": time_interval,
        "titers": titer_values,
        "sources": sources
    }
    return config
Beispiel #5
0
def make_config(params):
    """Build the prepare configuration for the measles build.

    Args:
        params: options object; only ``viruses_per_month`` is read and it is
            used as the subsampling threshold.

    Returns:
        dict: the configuration (input path, header layout, filters,
        subsampling rules, display traits and reference description).
    """
    dropped_strains = [
        "temara.MOR/24.03", "Mvs/Toulon.FRA/08.07" # clock is off
    ]
    # Integer literals with a leading zero (``01``) are a syntax error on
    # Python 3, so the date bounds are written with plain decimals.
    earliest = datetime(1950, 1, 1).date()
    latest = datetime(2020, 1, 1).date()
    # Each filter is (label, predicate); the predicate returns True when the
    # sequence should be kept.
    filters = (
        ("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
        ("Restrict Date Range", lambda s: s.attributes['date'] >= earliest),
        ("Restrict Date Range", lambda s: s.attributes['date'] <= latest),
        ("Sequence Length", lambda s: len(s.seq)>=5000),
        ("Number Ns", lambda s: s.seq.count('N')<=3000)
    )
    config = {
        "dir": "measles",
        "file_prefix": "measles",
        "title": "Real-time tracking of measles virus evolution",
        "maintainer": ["Trevor Bedford", "http://bedford.io/team/trevor-bedford/"],
        "input_paths": ["../../../fauna/data/measles.fasta"],
        "header_fields": {0:'strain', 2:'accession', 3:'date', 4:'region', 5:'country',
            6:'division', 8:'db', 10:'authors', 11:'url', 12:'title',
            13: 'journal', 14: 'paper_url'},
        "filters": filters,
        "subsample": {
            "threshold": params.viruses_per_month,
            # Sequences are grouped by (year, month, country).
            "category": lambda x:(x.attributes['date'].year, x.attributes['date'].month, x.attributes['country'])
        },
        "colors": ["authors", "country", "region"],
        "color_defs": ["./colors.tsv"],
        "lat_longs": ["country", "region"],
        "auspice_filters": ["authors", "region", "country"],
        "reference": {
            "path": "measles-reference.gb",
            "metadata": {
                'strain': "Ichinose-B95a", "accession": "NC_001498.1", "date": "XXXX-XX-XX",
                'host': "human", 'country': "Unknown", 'region': "Unknown"
            },
            "include": 0,
            "genes": ['N', 'P', 'V', 'C', 'M', 'F', 'H', 'L']
        }
    }

    return config
Beispiel #6
0
 "input_paths": ["./data/mahar_RHDV.edit.fasta"],
 #>AUS/ACT/BLMT-3/2015|blmt-3|RHDV|MF421563.1|RHDV1_G2|2015-06-18|Australia|ACT|Mahar et al|Monitoring the init
 "header_fields": {
     0: 'strain',
     1: 'isolate',
     4: 'rhdv_strain',
     5: 'date',
     6: 'country',
     7: 'state',
     8: 'authors',
     10: 'journal',
     9: 'title'
 },
 "filters":
 (("Dropped Strains",
   lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
  # ("Restrict Date Range", lambda s: s.attributes['date'] >= datetime(2012,01,1).date()),
  # ("Restrict Date Range", lambda s: s.attributes['date'] <= datetime(2018,01,1).date()),
  #("Sequence Length", lambda s: len(s.seq)>=10000),
  ),
 "subsample": {
     "category":
     lambda x: (x.attributes['date'].year, x.attributes['date'].month),
 },
 "colors": ["state", "authors", "rhdv_strain"],
 # "color_defs": ["./colors.tsv"],
 "lat_longs": ["isolate"],
 "lat_long_defs":
 "./RHDV2_coords.edit.txt",
 "auspice_filters": ["state", "authors", "rhdv_strain"],
 "reference": {
Beispiel #7
0
def make_config(serotype, params):
    """Assemble the prepare configuration for a single dengue serotype.

    Args:
        serotype: serotype label; interpolated into the default file prefix
            and the fasta file names, and used as the key into
            ``references``.
        params: options object.  Reads ``file_prefix``, ``sequences``,
            ``years_back``, ``titers`` and ``strains``.  When ``titers`` is
            set but does not name an existing file, ``params.titers`` is
            rewritten in place to point into the fauna data directory.

    Returns:
        dict: the configuration for this serotype.
    """
    fauna_data = '../../../fauna/data/'

    if params.file_prefix is not None:
        file_prefix = params.file_prefix
    else:
        file_prefix = "dengue_%s" % serotype

    serotype_fasta = "%sdengue_%s.fasta" % (fauna_data, serotype)
    if params.sequences is not None:
        input_paths = [params.sequences]
    elif os.path.isfile(serotype_fasta):  # Look for a serotype-specific fasta
        input_paths = [serotype_fasta]
    else:
        # No serotype-specific fasta: pull the serotype out of the
        # all-serotype fasta and warn the user about this behavior.  The
        # warning reports the file that is actually read.
        all_serotype_fasta = fauna_data + 'dengue_all.fasta'
        input_paths = [
            select_serotype(all_serotype_fasta, fauna_data, serotype)
        ]
        print(
            'WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'
            % (serotype, all_serotype_fasta, input_paths))

    years_back = params.years_back
    # Sample the clock once so both ends of the interval share one "today".
    today = datetime.today()
    time_interval = [
        today.date(),
        (today - timedelta(days=365.25 * years_back)).date()
    ]

    if params.titers is not None:
        if not os.path.isfile(params.titers):
            params.titers = '../../../fauna/data/%s' % params.titers
        # The second element of the returned triple is not used here.
        titer_values, _, sources = TiterCollection.load_from_file(
            params.titers)
    else:
        titer_values, sources = None, None

    force_include = sanofi_vaccine_strains.values()

    config = {
        "dir": "dengue",
        "lineage": serotype,
        "title": "Real-time tracking of dengue evolution",
        "maintainer": ["Sidney Bell", "http://bedford.io/team/sidney-bell/"],
        "file_prefix": file_prefix,
        "input_paths": input_paths,
        "header_fields": {
            0: 'strain',
            1: 'accession',
            2: 'date',
            3: 'region',
            4: 'country',
            5: 'division',
            6: 'location',
            7: 'authors',
            8: 'url'
        },
        "filters": (
            ("Dropped Strains",
             lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            # Keep a sequence when its region is usable, or when its
            # accession or strain is in the forced-include set.
            ("Bad Region", lambda s: any([
                s.attributes['region'] not in ['', ' ', '?'],
                s.attributes['accession'] in force_include,
                s.attributes['strain'] in force_include,
            ])),
        ),
        "subsample":
        dengue_subsampling(params, years_back, titer_values, force_include),
        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession"
        },
        "colors": ["authors", "region", "country"],
        "lat_longs": ["region", "country"],
        "auspice_filters": ["authors", "region", "country"],
        "reference": references[serotype],
        "time_interval": time_interval,
        "titers": titer_values,
        "strains": params.strains,
        "sources": sources
    }
    return config
Beispiel #8
0
def make_config(lineage, resolution, params):
    """Build the prepare configuration for a flu lineage at a resolution.

    Args:
        lineage: lineage name; interpolated into file names and used as the
            key into ``reference_viruses``, ``outliers`` and
            ``reference_maps``.
        resolution: string whose first run of digits is taken as the number
            of years to look back.
        params: options object; ``segments`` is read here and the whole
            object is passed on to ``flu_subsampling``.

    Returns:
        dict: the configuration.

    Raises:
        AttributeError: if ``resolution`` contains no digits (the regex
            search then returns ``None``).
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    years_back = int(re.search(r"(\d+)", resolution).group(1))

    # [newest, oldest].  The clock is sampled once so both ends share one
    # "today"; formatting to text and parsing back was a no-op round trip.
    today = datetime.today()
    time_interval = [
        today.date(),
        (today - timedelta(days=365.25 * years_back)).date()
    ]
    reference_cutoff = date(year=time_interval[0].year - 3, month=1, day=1)

    return {
        "dir": "flu",
        "file_prefix": "flu_{}".format(lineage),
        "segments": params.segments,
        "resolution": resolution,
        "lineage": lineage,
        "input_paths": [
            "../../fauna/data/{}_{}.fasta".format(lineage, segment)
            for segment in params.segments
        ],
        #  0                     1   2         3          4      5     6       7       8          9                             10  11
        # >A/Galicia/RR9542/2012|flu|EPI376225|2012-02-23|europe|spain|galicia|galicia|unpassaged|instituto_de_salud_carlos_iii|47y|female
        "header_fields": {
            0: 'strain',
            2: 'isolate_id',
            3: 'date',
            4: 'region',
            5: 'country',
            6: 'division',
            8: 'passage',
            9: 'lab',
            10: 'age',
            11: 'gender'
        },
        "filters": (
            # Keep sequences dated inside the interval, plus reference
            # viruses dated after the reference cutoff.
            ("Time Interval", lambda s:
             (time_interval[1] <= s.attributes['date'] <= time_interval[0])
             or (s.name in reference_viruses[lineage]
                 and s.attributes['date'] > reference_cutoff)),
            ("Sequence Length", lambda s: len(s.seq) >= 900),
            # what's the order of evaluation here I wonder?
            ("Dropped Strains",
             lambda s: s.id not in [fix_names(x) for x in outliers[lineage]]),
            ("Bad geo info",
             lambda s: s.attributes["country"] != "?"
             and s.attributes["region"] != "?"),
        ),
        "subsample":
        flu_subsampling(params, years_back,
                        "../../fauna/data/{}_crick_hi".format(lineage)),
        "colors": ["region"],
        "color_defs": ["colors.flu.tsv"],
        "lat_longs": ["country", "region"],
        "lat_long_defs": '../../fauna/source-data/geo_lat_long.tsv',
        "references": {
            seg: reference_maps[lineage][seg]
            for seg in params.segments
        },
        "regions": regions,
        "time_interval": time_interval,
    }
Beispiel #9
0
def make_config(lineage, resolution, params):
    """Build the prepare configuration for a seasonal flu lineage.

    Args:
        lineage: lineage name; used in file names and as the key into
            ``reference_viruses``, ``reference_maps`` and
            ``vaccine_choices``.
        resolution: string whose first run of digits is taken as the number
            of years to look back; also the key into ``LBI_params`` and
            ``frequency_params``.
        params: options object.  Reads ``time_interval``, ``titers``,
            ``sequences``, ``segments``, ``file_prefix``,
            ``ensure_all_segments`` and ``strains``.

    Returns:
        dict: the configuration.  ``vaccine_choices``, ``LBI_params`` and
        ``frequency_params`` are only present when defined for the given
        lineage / resolution; a warning is printed otherwise.

    Raises:
        AttributeError: if ``resolution`` contains no digits.
        IOError / OSError: if the lineage's outlier file cannot be opened.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    years_back = int(re.search(r"(\d+)", resolution).group(1))
    if params.time_interval:
        # Sorted newest-first so index 0 is the upper bound.
        time_interval = sorted(
            [datetime.strptime(x, '%Y-%m-%d').date() for x in params.time_interval],
            reverse=True)
    else:
        # Sample the clock once so both ends share one "today".
        today = datetime.today()
        time_interval = [today.date(), (today - timedelta(days=365.25 * years_back)).date()]
    reference_cutoff = date(year = time_interval[1].year - 4, month=1, day=1)

    # Load and prepare outliers for the given lineage.
    with open("metadata/%s_outliers.txt" % lineage, "r") as fh:
        outliers = [outlier.rstrip() for outlier in fh]

    fixed_outliers = [fix_names(x) for x in outliers]
    fixed_references = [fix_names(x) for x in reference_viruses[lineage]]

    if params.titers is not None:
        # Only the titer values are used; the other two elements are ignored.
        titer_values, _, _ = TiterCollection.load_from_file(params.titers)
    else:
        titer_values = None

    if params.sequences is not None:
        input_paths = params.sequences
    else:
        input_paths = ["../../../fauna/data/{}_{}.fasta".format(lineage, segment) for segment in params.segments]

    if params.file_prefix:
        file_prefix = params.file_prefix
    else:
        file_prefix = "flu_seasonal_{}_{}_{}".format(lineage, params.segments[0], resolution) # flu_seasonal_h3n2_ha_6y

    config = {
        "dir": "flu",
        "file_prefix": file_prefix,
        "title": make_title(lineage, resolution),
        "maintainer": ["Trevor Bedford and Barney Potter", "http://bedford.io/"],
        "auspice_filters": ["clade_membership", "region", "country"],
        "segments": params.segments,
        "ensure_all_segments": params.ensure_all_segments,
        "lineage": lineage,
        "resolution": resolution,
        "input_paths": input_paths,
        #  0                     1   2         3          4      5     6       7       8          9                             10  11
        # >A/Galicia/RR9542/2012|flu|EPI376225|2012-02-23|europe|spain|galicia|galicia|unpassaged|instituto_de_salud_carlos_iii|47y|female
        "header_fields": {
            0:'strain',  2:'isolate_id', 3:'date',
            4:'region',  5:'country',    6:'division',
            8:'passage', 9:'authors', 10:'age',
            11:'gender'
        },
        "filters": (
            # Keep sequences dated inside the interval, plus reference
            # viruses dated after the reference cutoff.
            ("Time Interval", lambda s:
                (s.attributes['date']<=time_interval[0] and s.attributes['date']>=time_interval[1]) or
                (s.name in fixed_references and s.attributes['date']>reference_cutoff)
            ),
            ("invalid chars", lambda s: sum([s.seq.count(c) for c in "EFIJKLOPQXYZ"])==0),
            ("Sequence Length", lambda s: len(s.seq)>=900),
            # what's the order of evaluation here I wonder?
            ("Dropped Strains", lambda s: s.id not in fixed_outliers),
            ("Bad geo info", lambda s: s.attributes["country"]!= "?" and s.attributes["region"]!= "?" ),
        ),
        "subsample": flu_subsampling(params, years_back, titer_values),
        "colors": ["region", "country"],
        "color_defs": ["colors.tsv"],
        "lat_longs": ["country", "region"],
        "references": {seg:reference_maps[lineage][seg] for seg in params.segments},
        "regions": regions,
        "time_interval": time_interval,
        "strains": params.strains,
        "titers": titer_values
    }

    ## VACCINES
    if lineage in vaccine_choices:
        config["vaccine_choices"] = vaccine_choices[lineage]
    else:
        print("WARNING. vaccine_choices are undefined for this lineage")

    # Only a missing key is treated as "undefined"; a bare ``except`` would
    # also hide unrelated errors (and KeyboardInterrupt).
    ## LBI
    try:
        config["LBI_params"] = LBI_params[resolution]
    except KeyError:
        print("WARNING. LBI parameters are undefined for this resolution")

    ## FREQUENCIES
    try:
        config["frequency_params"] = frequency_params[resolution]
    except KeyError:
        print("WARNING. Frequency parameters are undefined for this resolution")

    return config
Beispiel #10
0
def make_config(params):
    """Build the prepare configuration for the mumps build.

    Args:
        params: options object.  ``geo`` selects the build ("global" or
            "na"); ``viruses_per_month`` is the subsampling threshold, where
            0 means "use the default for this build" (3 for global, 100 for
            na).

    Returns:
        dict: the configuration.

    Raises:
        ValueError: if ``params.geo`` is neither "global" nor "na".
    """
    # Integer literals with a leading zero (``01``) are a syntax error on
    # Python 3, so all date bounds are written with plain decimals.
    latest = datetime(2020, 1, 1).date()

    if params.geo == "global":
        file_prefix = "mumps_global"
        default_viruses_per_month = 3
        dropped_strains = [
            "WA0268502_buccal/Washington.USA/16", # not yet released
            "Split.CRO/05.11/G",   # retracted sequence
            "9218/Zg98",   # retracted sequence
            "Zagreb.HRV/28.12/G",  # retracted sequence
            "Du/CRO05"   # retracted sequence
        ]
        colors = ["authors", "region", "country", "MuV_genotype"]
        lat_longs = ["country", "region"]
        auspice_filters = ["authors", "region", "country", "MuV_genotype"]
        earliest = datetime(1950, 1, 1).date()
        filters = (
            ("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            ("Restrict Date Range", lambda s: s.attributes['date'] >= earliest),
            ("Restrict Date Range", lambda s: s.attributes['date'] <= latest),
            ("Sequence Length", lambda s: len(s.seq)>=5000),
            ("Number Ns", lambda s: s.seq.count('N')<=3000)
        )
    elif params.geo == "na":
        file_prefix = "mumps_na"
        default_viruses_per_month = 100
        dropped_strains = [
            "Ontario.CAN/13.10/G", "Ontario.CAN/04.10/G", "Massachusetts.USA/37.16/1/G", "BritishColumbia.CAN/50.16/H",
            "BritishColumbia.CAN/22.16/1/G", "Mass.USA/4.10",
            "Virginia.USA/10.12/H", "BritishColumbia.CAN/33.16/3/G",
            "BritishColumbia.CAN/33.16/1/G", "BritishColumbia.CAN/9.17/A",
            "BritishColumbia.CAN/28.16/3/G",
            # all below are true strains, but group outside NA outbreak clade
            "WA0268502_buccal/Washington.USA/16", # not yet released
            "Washington.USA/2017217/8.17/3/G", # outlier. MRCA with other NA strains of 1990
            "BritishColumbia.CAN/34.16/2/F", #MuV genotype F. MRCA of 1943 (!)
            "Massachusetts.USA/24.17/5/K", #MuV genotype K
            "Massachusetts.USA/11.17/G" , "Massachusetts.USA/7.17/G", "Massachusetts.USA/9.17/G",
            "Massachusetts.USA/10.17/G","Massachusetts.USA/10.17/2/G","Massachusetts.USA/7.17/2/G",
            "Massachusetts.USA/13.17/G","Massachusetts.USA/12.17/G","Georgia.USA/2.17/G",
            "Massachusetts.USA/5.17/G","Massachusetts.USA/18.17/G","Massachusetts.USA/22.17/7/G",
            "Massachusetts.USA/23.17/2/G","Massachusetts.USA/19.17/2/G"
        ]
        colors = ["authors", "country", "division", "MuV_genotype"]
        lat_longs = ["country", "division"]
        auspice_filters = ["authors", "country", "division", "MuV_genotype"]
        earliest = datetime(2009, 1, 1).date()
        filters = (
            ("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
            ("Restrict Date Range", lambda s: s.attributes['date'] >= earliest),
            ("Restrict Date Range", lambda s: s.attributes['date'] <= latest),
            ("Sequence Length", lambda s: len(s.seq) >= 5000),
            ("Number Ns", lambda s: s.seq.count('N') <= 3000),
            ("Restrict Region", lambda s: s.attributes['region'] == 'north_america')
        )
    else:
        # Without this branch an unknown value surfaced later as an
        # UnboundLocalError on ``file_prefix``.
        raise ValueError(
            "unsupported geo %r: expected 'global' or 'na'" % (params.geo,))

    # A threshold of 0 means "not specified": fall back to the build default.
    if params.viruses_per_month == 0:
        viruses_per_month = default_viruses_per_month
    else:
        viruses_per_month = params.viruses_per_month

    config = {
        "dir": "mumps",
        "file_prefix": file_prefix,
        "title": "Real-time tracking of mumps virus evolution",
        "maintainer": ["Louise Moncla", "http://bedford.io/team/louise-moncla/"],
        "input_paths": ["../../../fauna/data/mumps.fasta"],
        "header_fields": {0:'strain', 2:'accession', 3:'date', 4:'region', 5:'country',
            6:'division', 8:'db', 10:'authors', 11:'url', 12:'title',
            13: 'journal', 14: 'paper_url', 15: 'MuV_genotype'},
        "filters": filters,
        "subsample": {
            "threshold": viruses_per_month,
            # Sequences are grouped by (year, month, country, genotype).
            "category": lambda x:(x.attributes['date'].year, x.attributes['date'].month, x.attributes['country'],x.attributes['MuV_genotype'])
        },
        "colors": colors,
        "color_defs": ["./colors.tsv"],
        "lat_longs": lat_longs,
        "auspice_filters": auspice_filters,
        "reference": {
            "path": "mumps-reference.gb",
            "metadata": {
                'strain': "MuV/Gabon/13/2", "accession": "KM597072.1", "date": "2013-03-01",
                'host': "human", 'country': "Gabon", 'region': "Gabon", 'MuV_genotype': "G"
            },
            "include": 0,
            "genes": ['NC', 'P', 'V', 'I', 'M', 'F', 'SH', 'HN', 'L']
        }
    }

    return config
Beispiel #11
0
def make_config(params):
    """Return the prepare configuration for the two-segment Lassa build.

    Args:
        params: options object; only ``viruses_per_month`` is read and it is
            used as the subsampling threshold.

    Returns:
        dict: the configuration, with per-segment ("s" / "l") filters and
        references.
    """
    # Header fields occupy consecutive positions starting at 0.
    # NOTE(review): 'accesion' is spelled this way in the original while the
    # reference metadata uses "accession" -- confirm before renaming.
    header_fields = dict(enumerate([
        'strain', 'accesion', 'segment', 'date', 'region', 'country',
        'host_species', 'authors', 'title', 'journal', 'paper_url',
    ]))

    # No date restriction is applied at the moment: both segment predicates
    # accept every sequence.
    date_range_by_segment = {
        "s": lambda s: True,
        "l": lambda s: True
    }
    # Minimum sequence length differs per segment.
    min_length_by_segment = {
        "s": lambda s: len(s.seq) >= 2500,
        "l": lambda s: len(s.seq) >= 5000,
    }
    filters = (
        ("Dropped Strains",
         lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
        ("Restrict Date Range for S segment", date_range_by_segment),
        ("Sequence Length", min_length_by_segment),
    )

    subsample = {
        # Sequences are grouped by (country, year).
        "category": lambda x: (x.attributes['country'],
                               x.attributes['date'].year),
        "threshold": params.viruses_per_month,
        "priority": lambda x: x.id in forced_strains
    }

    # references are pinneo strain. Same as Kristian's Cell paper.
    # Pinneo paper: http://jvi.asm.org/content/74/15/6992.long
    # Cell paper: http://www.cell.com/cell/pdfExtended/S0092-8674(15)00897-1
    s_reference = {
        "path": "metadata/lassa_s.gb",
        "metadata": {
            'strain': "Nig08_04",
            "accession": "GU481068",
            "date": "2008-XX-XX",
            'country': "nigeria",
            'segment': 'S'
        },
        "include": 1,
        "genes": ['NP', 'GPC']
    }
    l_reference = {
        "path": "metadata/lassa_l.gb",
        "metadata": {
            'strain': "Pinneo-NIG-1969",
            "accession": "KM822127",
            "date": "1969-XX-XX",
            'country': "nigeria",
            'segment': 'L'
        },
        "include": 1,
        "genes": ['Z', 'L']
    }

    config = {
        "dir": "lassa",
        "file_prefix": "lassa",
        "title": "Real-time tracking of Lassa virus evolution",
        "maintainer": ["Bedford Lab", "http://bedford.io/team/"],
        "input_paths": [
            "../../../flora/data/lassa_s.fasta",
            "../../../flora/data/lassa_l.fasta",
        ],
        "header_fields": header_fields,
        "filters": filters,
        "subsample": subsample,
        "colors": ["country", "host_species"],
        "color_defs": ["./colors.tsv"],
        "lat_longs": ["country"],
        "auspice_filters": ["country", "authors", "host_species"],
        "references": {"s": s_reference, "l": l_reference},
    }
    return config
Beispiel #12
0
from __future__ import print_function
import os, sys
sys.path.append('..') # we assume (and assert) that this script is running from the virus directory, i.e. inside H7N9 or zika
from base.prepare import prepare
from datetime import datetime
from base.utils import fix_names
import argparse

def collect_args():
    """Parse the command line and return the resulting namespace.

    The parser declares no options of its own, so only argparse's built-in
    ``-h`` / ``--help`` flag is accepted.
    """
    return argparse.ArgumentParser(
        description = "Prepare fauna FASTA for analysis"
    ).parse_args()

# Strain names to exclude from the build; none are dropped at the moment.
dropped_strains = []

# Named filters.  Each entry maps a short key to a (label, predicate) pair;
# the predicate receives a sequence object and returns True when the
# sequence should be kept.
filters = {
    key: (label, keep)
    for key, label, keep in (
        ("dropped_strains", "Dropped Strains",
         lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
        ("canada_only", "Canada only",
         lambda s: s.attributes['country'] == "canada"),
        ("exclude_BC", "Exclude BC outbreak",
         lambda s: not s.attributes['accession'].startswith("BC_outbreak")),
        ("Mass_only", "Massachusetts only",
         lambda s: s.attributes['accession'].startswith("Massachusetts_outbreak")),
        ("exclude_Mass", "Exclude Massachusetts outbreak",
         lambda s: not s.attributes['accession'].startswith("Massachusetts_outbreak")),
        ("unknown_country", "Exclude unknown countries",
         lambda s: not s.attributes['country'].startswith("unknown")),
    )
}

def make_config(context):
    config = {
        "dir": "mumps",
        "file_prefix": "mumps_%s"%context,
        "title": "Mumps virus (context: {}).format(context)",
        "maintainer": ["@LouiseHMoncla", "https://twitter.com/louisehmoncla"],
        "input_paths": ["../../fauna/data/mumps.fasta"],
        "header_fields": {0:'strain', 2:'accession', 3:'date', 4:'region', 5:'country',
Beispiel #13
0
def make_config(lineage, resolution, params):
    """Build the prepare configuration for a flu lineage at a resolution.

    Args:
        lineage: lineage name; used in file names and as the key into
            ``outliers``, ``reference_viruses`` and ``reference_maps``.
        resolution: string whose first run of digits is taken as the number
            of years to look back.
        params: options object.  Reads ``time_interval``, ``titers``,
            ``sequences``, ``segment``, ``file_prefix`` and ``strains``.

    Returns:
        dict: the configuration.

    Raises:
        AttributeError: if ``resolution`` contains no digits (the regex
            search then returns ``None``).
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    years_back = int(re.search(r"(\d+)", resolution).group(1))
    if params.time_interval:
        # Sorted newest-first so index 0 is the upper bound.
        time_interval = sorted(
            [datetime.strptime(x, '%Y-%m-%d').date()
             for x in params.time_interval],
            reverse=True)
    else:
        # Sample the clock once so both ends share one "today".
        today = datetime.today()
        time_interval = [
            today.date(),
            (today - timedelta(days=365.25 * years_back)).date()
        ]
    reference_cutoff = date(year=time_interval[1].year - 3, month=1, day=1)
    fixed_outliers = [fix_names(x) for x in outliers[lineage]]
    fixed_references = [fix_names(x) for x in reference_viruses[lineage]]

    if params.titers is not None:
        # Only the titer values are used; the other two elements are ignored.
        titer_values, _, _ = TiterModel.load_from_file(params.titers)
    else:
        titer_values = None

    if params.sequences is not None:
        input_paths = params.sequences
    else:
        input_paths = [
            "../../fauna/data/{}_{}.fasta".format(lineage, segment)
            for segment in params.segment
        ]

    if params.file_prefix:
        file_prefix = params.file_prefix
    else:
        file_prefix = "flu_{}_{}_{}".format(lineage, params.segment[0],
                                            resolution)

    return {
        "dir": "flu",
        "file_prefix": file_prefix,
        "title": make_title(lineage, resolution),
        "maintainer": ["@trvrb", "https://twitter.com/trvrb"],
        "segments": params.segment,
        "lineage": lineage,
        "input_paths": input_paths,
        #  0                     1   2         3          4      5     6       7       8          9                             10  11
        # >A/Galicia/RR9542/2012|flu|EPI376225|2012-02-23|europe|spain|galicia|galicia|unpassaged|instituto_de_salud_carlos_iii|47y|female
        "header_fields": {
            0: 'strain',
            2: 'isolate_id',
            3: 'date',
            4: 'region',
            5: 'country',
            6: 'division',
            8: 'passage',
            9: 'lab',
            10: 'age',
            11: 'gender'
        },
        "filters": (
            # Keep sequences dated inside the interval, plus reference
            # viruses dated after the reference cutoff.
            ("Time Interval", lambda s:
             (time_interval[1] <= s.attributes['date'] <= time_interval[0])
             or (s.name in fixed_references
                 and s.attributes['date'] > reference_cutoff)),
            ("Sequence Length", lambda s: len(s.seq) >= 900),
            # what's the order of evaluation here I wonder?
            ("Dropped Strains", lambda s: s.id not in fixed_outliers),
            ("Bad geo info",
             lambda s: s.attributes["country"] != "?"
             and s.attributes["region"] != "?"),
        ),
        "subsample": flu_subsampling(params, years_back, titer_values),
        "colors": ["region"],
        "color_defs": ["colors.flu.tsv"],
        "lat_longs": ["country", "region"],
        "lat_long_defs": '../../fauna/source-data/geo_lat_long.tsv',
        "references": {
            seg: reference_maps[lineage][seg]
            for seg in params.segment
        },
        "regions": regions,
        "time_interval": time_interval,
        "strains": params.strains,
        "titers": titer_values
    }