# Beispiel #1 (Example #1) — pasted-snippet separator, kept as a comment
# 0
def make_config(serotype, params):
    """Assemble the prepare() configuration for one dengue serotype.

    Parameters
    ----------
    serotype : str
        Serotype identifier used to locate input files (e.g. "denv1").
    params : argparse.Namespace
        CLI options; reads ``file_prefix``, ``sequences``, ``years_back``,
        ``titers`` and ``strains``.

    Returns
    -------
    dict
        Configuration dictionary consumed by the downstream pipeline.
    """
    if params.file_prefix is not None:
        file_prefix = params.file_prefix
    else:
        file_prefix = "dengue_%s" % serotype

    all_serotype_fasta = '../../../../data/sequences/dengue_all.fasta'
    if params.sequences is not None:
        input_paths = params.sequences
    elif os.path.isfile("../../../../data/sequences/dengue_%s.fasta" % serotype):
        # Prefer a serotype-specific fasta when one exists.
        input_paths = ["../../../../data/sequences/dengue_%s.fasta" % serotype]
    else:
        # Fall back to pulling serotype-specific sequences out of the
        # all-serotype fasta and warn the user about this behavior.
        input_paths = [select_serotype(all_serotype_fasta, '../../../../data/sequences/', serotype)]
        # BUGFIX: the warning previously named '../fauna/data/dengue.fasta',
        # which is not the file that was actually read, and printed the whole
        # list where a single output filename was expected.
        print('WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s' % (serotype, all_serotype_fasta, input_paths[0]))

    years_back = params.years_back
    # Capture "now" once so both interval endpoints are consistent
    # even if this runs across a date boundary.
    today = datetime.today()
    time_interval = [today.date(), (today - timedelta(days=365.25 * years_back)).date()]

    if params.titers is not None:
        if not os.path.isfile(params.titers):
            # Treat a bare filename as relative to the titer data directory.
            params.titers = '../../../../data/titers_katzelnick2015/%s' % params.titers
        titer_values, strains, sources = TiterCollection.load_from_file(
            params.titers, excluded_sources=['agm_1mo', 'agm_5mo'])
        print(sources)
    else:
        titer_values, strains, sources = None, None, None

    # Sanofi vaccine strains are always retained by the subsampler.
    force_include = sanofi_vaccine_strains.values()

    config = {
        "dir": "dengue",
        "lineage": serotype,
        "title": "Real-time tracking of dengue evolution",
        "maintainer": ["Sidney Bell", "http://bedford.io/team/sidney-bell/"],
        "file_prefix": file_prefix,
        "input_paths": input_paths,
        "header_fields": {0: 'strain', 1: 'accession', 2: 'date', 3: 'region', 4: 'country',
                          5: 'division', 6: 'location', 7: 'authors', 8: 'url'},
        # Filters are intentionally disabled for this build variant.
        "filters": (),

        "subsample": dengue_subsampling(params, years_back, titer_values,
                                        force_include),

        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession"},

        "colors": ["authors", "region", "country"],
        "lat_longs": ["region", "country"],
        "auspice_filters": ["authors", "region", "country"],
        "reference": references[serotype],
        "time_interval": time_interval,
        "titers": titer_values,
        "strains": params.strains,
        "sources": sources
    }
    return config
def dengue_subsampling(params, years_back, titer_values, force_include=()):
    """Build subsampling rules: bin on (region, year, month) and keep up to
    3 sequences per bin, preferring titered / unambiguous sequences.

    Parameters
    ----------
    params, years_back
        Unused here; kept for interface parity with the flu subsampler.
    titer_values : dict or None
        Titer measurements; when None, priorities are random.
    force_include : iterable of str
        Strain names / accessions that always win (priority 10000).

    Returns
    -------
    dict
        Keys "category", "priority" (callables) and "threshold" (int).
    """
    # Set gives O(1) membership tests and also fixes the original
    # mutable-default-argument (force_include=[]) pitfall.
    force_include = set(force_include)

    # Category: bin on region, year, month.
    category = lambda x: (x.attributes['region'],
                          x.attributes['date'].year,
                          x.attributes['date'].month)

    # Priority: number of titer measurements, slight length bonus,
    # penalty for ambiguous nucleotide codes.
    if titer_values is not None:
        titer_count = TiterCollection.count_strains(titer_values)

        def priority(seq):
            strain = seq.attributes['strain']
            accession = seq.attributes['accession']
            if strain in force_include or accession in force_include:
                return 10000  # forced strains outrank everything else
            pr = titer_count.get(strain, 0)
            return pr + len(seq.seq) * 0.00005 - 0.01 * np.sum(
                [seq.seq.count(nuc) for nuc in 'NRWYMKSHBVD'])
    else:
        print("Couldn't load titer information - using random priorities")

        def priority(seq):
            strain = seq.attributes['strain']
            accession = seq.attributes['accession']
            if strain in force_include or accession in force_include:
                return 10000
            return np.random.random()

    return {"category": category, "priority": priority, "threshold": 3}
def make_config(serotype, params):
    """Assemble the prepare() configuration for one dengue serotype
    (fauna-relative input paths variant).

    Parameters
    ----------
    serotype : str
        Serotype identifier used to locate input files (e.g. "denv1").
    params : argparse.Namespace
        CLI options; reads ``file_prefix``, ``sequences``, ``years_back``,
        ``titers`` and ``strains``.

    Returns
    -------
    dict
        Configuration dictionary consumed by the downstream pipeline.
    """
    if params.file_prefix is not None:
        file_prefix = params.file_prefix
    else:
        file_prefix = "dengue_%s" % serotype

    all_serotype_fasta = '../../../fauna/data/dengue_all.fasta'
    if params.sequences is not None:
        input_paths = [params.sequences]
    elif os.path.isfile("../../../fauna/data/dengue_%s.fasta" % serotype):
        # Prefer a serotype-specific fasta when one exists.
        input_paths = ["../../../fauna/data/dengue_%s.fasta" % serotype]
    else:
        # Fall back to pulling serotype-specific sequences out of the
        # all-serotype fasta and warn the user about this behavior.
        input_paths = [select_serotype(all_serotype_fasta, '../../../fauna/data/', serotype)]
        # BUGFIX: the warning previously named '../fauna/data/dengue.fasta',
        # which is not the file that was actually read, and printed the whole
        # list where a single output filename was expected.
        print('WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s' % (serotype, all_serotype_fasta, input_paths[0]))

    years_back = params.years_back
    # Capture "now" once so both interval endpoints are consistent
    # even if this runs across a date boundary.
    today = datetime.today()
    time_interval = [today.date(), (today - timedelta(days=365.25 * years_back)).date()]

    if params.titers is not None:
        if not os.path.isfile(params.titers):
            # Treat a bare filename as relative to the fauna data directory.
            params.titers = '../../../fauna/data/%s' % params.titers
        titer_values, strains, sources = TiterCollection.load_from_file(params.titers)
    else:
        titer_values, strains, sources = None, None, None

    # Sanofi vaccine strains are always retained by the subsampler.
    force_include = sanofi_vaccine_strains.values()

    config = {
        "dir": "dengue",
        "lineage": serotype,
        "title": "Real-time tracking of dengue evolution",
        "maintainer": ["Sidney Bell", "http://bedford.io/team/sidney-bell/"],
        "file_prefix": file_prefix,
        "input_paths": input_paths,
        "header_fields": {0: 'strain', 1: 'accession', 2: 'date', 3: 'region', 4: 'country',
                          5: 'division', 6: 'location', 7: 'authors', 8: 'url'},
        "filters": (("Dropped Strains", lambda s: s.id not in [fix_names(x) for x in dropped_strains]),
                    ("Bad Region", lambda s: any([
                        s.attributes['region'] not in ['', ' ', '?'],
                        s.attributes['accession'] in force_include,
                        s.attributes['strain'] in force_include
                    ]))),

        "subsample": dengue_subsampling(params, years_back, titer_values,
                                        force_include),

        "add_urls": {
            "prefix": "https://www.ncbi.nlm.nih.gov/nuccore/%s",
            "attr": "accession"},

        "colors": ["authors", "region", "country"],
        "lat_longs": ["region", "country"],
        "auspice_filters": ["authors", "region", "country"],
        "reference": references[serotype],
        "time_interval": time_interval,
        "titers": titer_values,
        "strains": params.strains,
        "sources": sources
    }
    return config
def dengue_subsampling(params, years_back, titer_values, force_include=()):
    """Build subsampling rules: bin on (region, year, month) and keep up to
    3 sequences per bin, preferring titered / unambiguous sequences.

    Parameters
    ----------
    params, years_back
        Unused here; kept for interface parity with the flu subsampler.
    titer_values : dict or None
        Titer measurements; when None, priorities are random.
    force_include : iterable of str
        Strain names / accessions that always win (priority 10000).

    Returns
    -------
    dict
        Keys "category", "priority" (callables) and "threshold" (int).
    """
    # Set gives O(1) membership tests and also fixes the original
    # mutable-default-argument (force_include=[]) pitfall.
    force_include = set(force_include)

    # Category: bin on region, year, month.
    category = lambda x: (x.attributes['region'],
                          x.attributes['date'].year,
                          x.attributes['date'].month)

    # Priority: number of titer measurements, slight length bonus,
    # penalty for ambiguous nucleotide codes.
    if titer_values is not None:
        titer_count = TiterCollection.count_strains(titer_values)

        def priority(seq):
            strain = seq.attributes['strain']
            accession = seq.attributes['accession']
            if strain in force_include or accession in force_include:
                return 10000  # forced strains outrank everything else
            pr = titer_count.get(strain, 0)
            return pr + len(seq.seq) * 0.00005 - 0.01 * np.sum(
                [seq.seq.count(nuc) for nuc in 'NRWYMKSHBVD'])
    else:
        print("Couldn't load titer information - using random priorities")

        def priority(seq):
            strain = seq.attributes['strain']
            accession = seq.attributes['accession']
            if strain in force_include or accession in force_include:
                return 10000
            return np.random.random()

    return {
        "category": category,
        "priority": priority,
        "threshold": 3
    }
# Beispiel #5 (Example #5) — pasted-snippet separator, kept as a comment
# 0
    def write_json(self, fh, config, prefix):
        """Serialize this sequence collection plus run metadata to *fh* as JSON.

        Strips non-serializable ``date``/``num_date`` attributes from each
        sequence first (the raw_date attribute is presumably kept so they can
        be reconstructed — TODO confirm).

        Parameters
        ----------
        fh : file-like
            Open writable handle receiving the JSON document.
        config : dict
            Pipeline configuration; reads "date_format", "subsample", "title",
            "maintainer", "auspice_filters", "input_paths" and several
            optional keys ("titers", "time_interval", "vaccine_choices", ...).
        prefix : str
            Dataset prefix recorded under data["info"]["prefix"].
        """
        # datetime() objects and [arrays] don't go to JSONs
        # not a problem - we still have raw_date to get them back
        for seq in self.seqs.values():
            if 'date' in seq.attributes:
                del seq.attributes['date']
            if 'num_date' in seq.attributes:
                del seq.attributes['num_date']

        data = self.extras
        data["info"] = {
            "n(starting)": self.nstart,
            "n(final)": len(self.seqs),
            # "commit": git.Repo(search_parent_directories=True).head.object.hexsha,
            "date_format": config["date_format"],
            "subsampled": bool(config["subsample"]),
            "traits_are_dates": [],
            "title": config["title"],
            "maintainer": config["maintainer"],
            "auspice_filters": config["auspice_filters"]
        }
        # Only keep date-traits that actually exist as header fields.
        if "traits_are_dates" in config and isinstance(config["traits_are_dates"], (list, tuple)):
            data["info"]["traits_are_dates"] = [trait for trait in config["traits_are_dates"] if trait in config["header_fields"].values()]
        data["info"]["prefix"] = prefix
        if self.segmentName == "genome":
            data["info"]["input_file"] = config["input_paths"][0]
        else:
            # Segmented builds: pick the input file matching this segment.
            data["info"]["input_file"] = config["input_paths"][config["segments"].index(self.segmentName)]
        if "time_interval" in config:
            data["info"]["time_interval"] = [str(x) for x in config["time_interval"]]
        potentially_combine(config, data["info"], "regions")
        potentially_combine(config, data["info"], "lineage", False)
        data["info"]["segment"] = self.segmentName
        potentially_combine(config, data["info"], "resolution", False)
        data["sequences"] = {}
        # BUGFIX: .iteritems() is Python-2-only; .items() works on 2 and 3.
        for seqName, seq in self.seqs.items():
            data["sequences"][seqName] = {
                "attributes": seq.attributes,
                "seq": str(seq.seq)
            }
        if not self.reference:
            data["reference"] = None
        else:
            data["reference"] = {
                # "attributes": self.reference.attributes,
                "strain": self.reference.attributes["strain"],
                "seq": str(self.reference.seq),
                "genes": self.reference.genes,
                "included": self.reference.name in self.seqs
            }

        # Titers must be present in the config and not None to be used.
        if config.get("titers") is not None:
            # Subset titer data to match the strains selected for export.
            filtered_titers = TiterCollection.filter_strains(config["titers"], self.seqs.keys())

            # Convert tuple dictionary keys to strings for JSON compatability.
            # BUGFIX: .iteritems() is Python-2-only; .items() works on 2 and 3.
            data["titers"] = {str(key): value
                              for key, value in filtered_titers.items()}
            logger.debug("Filtered titers from %i to %i measures" % (len(config["titers"]), len(data["titers"])))

        # Flu-specific elements...
        if "vaccine_choices" in config and config["vaccine_choices"] is not None:
            data["info"]["vaccine_choices"] = {}
            for k, v in config["vaccine_choices"].items():
                if k in self.extras["leaves"]:
                    data["info"]["vaccine_choices"][k] = v
                else:
                    print("WARNING! Vaccine strain {} was not present in the data".format(k))
        if "LBI_params" in config:
            data["info"]["LBI_params"] = config["LBI_params"]
        if "frequency_params" in config:
            data["info"]["frequency_params"] = config["frequency_params"]

        json.dump(data, fh, indent=2)
# Beispiel #6 (Example #6) — pasted-snippet separator, kept as a comment
# 0
def flu_subsampling(params, years_back, titer_values):
    """Build the flu subsampling rules (category, priority, threshold).

    ``params.sampling`` selects the scheme: "even" (balance all regions),
    a region name (favor that region), or anything else ("flat").

    Parameters
    ----------
    params : argparse.Namespace
        Reads ``sampling``, ``viruses_per_month`` and ``lineage``.
    years_back : int
        Picks the default viruses-per-month threshold from ``vpm_dict``.
    titer_values : dict or None
        Titer measurements; when None, priorities are random.

    Returns
    -------
    dict
        Keys "category" and "priority" (callables) and "threshold"
        (an int-returning callable factory or a constant lambda).
    """
    if params.sampling == "even":
        type_of_subsampling = "even"
    elif params.sampling in [x[0] for x in regions]:
        type_of_subsampling = "priority"
    else:
        type_of_subsampling = "flat"

    #### DEFINE THE CATEGORY:
    if type_of_subsampling in ["even", "priority"]:
        category = lambda x: (x.attributes['region'],
                              x.attributes['date'].year,
                              x.attributes['date'].month)
    else:
        category = lambda x: (x.attributes['date'].year,
                              x.attributes['date'].month)

    #### DEFINE THE PRIORITY
    # BUGFIX: previously the code branched on the truthiness of
    # HI_titer_count; when titers loaded but the count dict was empty
    # (falsy), neither branch defined priority() and the return below
    # raised NameError. Branch on titer_values instead.
    if titer_values is not None:
        HI_titer_count = TiterCollection.count_strains(titer_values)

        def priority(seq):
            # Prefer titered strains, reward length, penalize ambiguous
            # sites; reference viruses always outrank everything else.
            pr = HI_titer_count.get(seq.attributes['strain'], 0)
            return (pr + len(seq.seq) * 0.0001 -
                    0.01 * np.sum([seq.seq.count(nuc) for nuc in 'NRWYMKSHBVD']) +
                    1e6 * int(seq.name in reference_viruses[params.lineage]))
    else:
        print("Couldn't load titer information - using random priorities")

        def priority(seq):
            # Random, but reference viruses still outrank everything else.
            return np.random.random() + int(
                seq.name in reference_viruses[params.lineage])

    ##### DEFINE THE THRESHOLD
    if params.viruses_per_month != 0:
        sampling_threshold = params.viruses_per_month
    else:
        sampling_threshold = vpm_dict[years_back]

    region_threshold = int(np.ceil(1.0 * sampling_threshold / len(regions)))
    if type_of_subsampling == "even":

        def threshold(obj):
            """
            a higher order function which returns a fn which has access to
            some summary stats about the sequences (closure)
            """
            sequence_count_total, sequence_count_region = populate_counts(obj)

            def threshold_fn(x):
                # x is the collection key, a tuple of (region, year, month).
                if sequence_count_total[(x[1], x[2])] < sampling_threshold:
                    return sampling_threshold
                region_counts = sorted([
                    sequence_count_region[(r[0], x[1], x[2])] for r in regions
                ])
                if region_counts[0] > region_threshold:
                    return region_threshold
                # Water-filling: raise the per-region cap until the total
                # budget is exhausted.
                left_to_fill = sampling_threshold - len(
                    regions) * region_counts[0]
                thres = region_counts[0]
                for ri, rc in zip(range(len(regions) - 1, 0, -1),
                                  region_counts[1:]):
                    if left_to_fill - ri * (rc - thres) > 0:
                        left_to_fill -= ri * (rc - thres)
                        thres = rc
                    else:
                        # NOTE(review): '/' is float division on Py3 and
                        # integer division on Py2 — confirm intended runtime.
                        thres += left_to_fill / ri
                        break
                return max(1, int(thres))

            return threshold_fn
    elif type_of_subsampling == "priority":
        priority_region = params.sampling
        # Reserve this fraction of the monthly budget for the chosen region.
        fraction = 0.5

        def threshold(obj):
            """
            a higher order function which returns a fn which has access to
            some summary stats about the sequences (closure)
            """
            sequence_count_total, sequence_count_region = populate_counts(obj)

            def threshold_fn(x):
                # x is the collection key, a tuple of (region, year, month).
                if x[0] == priority_region:
                    return int(sampling_threshold * fraction)
                nregions = len(regions) - 1
                total_threshold_world = sampling_threshold * (1 - fraction)
                region_threshold = int(
                    np.ceil(1.0 * total_threshold_world / nregions))
                region_counts = sorted([
                    sequence_count_region[(r[0], x[1], x[2])] for r in regions
                    if r != priority_region
                ])
                if region_counts[0] > region_threshold:
                    return region_threshold
                else:
                    left_to_fill = total_threshold_world - nregions * region_counts[
                        0]
                    thres = region_counts[0]
                    for ri, rc in zip(range(nregions - 1, 0, -1),
                                      region_counts[1:]):
                        if left_to_fill - ri * (rc - thres) > 0:
                            left_to_fill -= ri * (rc - thres)
                            thres = rc
                        else:
                            thres += left_to_fill / ri
                            break
                    return max(1, int(thres))

            return threshold_fn
    else:  # flat subsampling
        threshold = lambda x: sampling_threshold

    return {"category": category, "priority": priority, "threshold": threshold}
# Beispiel #7 (Example #7) — pasted-snippet separator, kept as a comment
# 0
def make_config(lineage, resolution, params):
    """Assemble the prepare() configuration for one seasonal-flu build.

    Parameters
    ----------
    lineage : str
        Flu lineage, e.g. "h3n2"; keys into reference_viruses,
        reference_maps and vaccine_choices.
    resolution : str
        Time-window label containing the number of years, e.g. "6y".
    params : argparse.Namespace
        CLI options; reads time_interval, titers, sequences, segments,
        file_prefix, ensure_all_segments and strains.

    Returns
    -------
    dict
        Configuration dictionary consumed by the downstream pipeline.
    """
    years_back = int(re.search("(\d+)", resolution).groups()[0])
    if params.time_interval:
        time_interval = sorted([datetime.strptime(x, '%Y-%m-%d').date() for x in params.time_interval], reverse=True)
    else:
        # Capture "now" once so both interval endpoints are consistent.
        today = datetime.today()
        time_interval = [today.date(), (today - timedelta(days=365.25 * years_back)).date()]
    # References older than this are only kept if explicitly listed.
    reference_cutoff = date(year=time_interval[1].year - 4, month=1, day=1)

    # Load and prepare outliers for the given lineage.
    with open("metadata/%s_outliers.txt" % lineage, "r") as fh:
        outliers = [outlier.rstrip() for outlier in fh]

    fixed_outliers = [fix_names(x) for x in outliers]
    fixed_references = [fix_names(x) for x in reference_viruses[lineage]]

    if params.titers is not None:
        titer_values, strains, sources = TiterCollection.load_from_file(params.titers)
    else:
        titer_values = None

    if params.sequences is not None:
        input_paths = params.sequences
    else:
        input_paths = ["../../../fauna/data/{}_{}.fasta".format(lineage, segment) for segment in params.segments]

    if params.file_prefix:
        file_prefix = params.file_prefix
    else:
        file_prefix = "flu_seasonal_{}_{}_{}".format(lineage, params.segments[0], resolution) # flu_seasonal_h3n2_ha_6y

    config = {
        "dir": "flu",
        "file_prefix": file_prefix,
        "title": make_title(lineage, resolution),
        "maintainer": ["Trevor Bedford and Barney Potter", "http://bedford.io/"],
        "auspice_filters": ["clade_membership", "region", "country"],
        "segments": params.segments,
        "ensure_all_segments": params.ensure_all_segments,
        "lineage": lineage,
        "resolution": resolution,
        "input_paths": input_paths,
        #  0                     1   2         3          4      5     6       7       8          9                             10  11
        # >A/Galicia/RR9542/2012|flu|EPI376225|2012-02-23|europe|spain|galicia|galicia|unpassaged|instituto_de_salud_carlos_iii|47y|female
        "header_fields": {
            0:'strain',  2:'isolate_id', 3:'date',
            4:'region',  5:'country',    6:'division',
            8:'passage', 9:'authors', 10:'age',
            11:'gender'
        },
        "filters": (
            ("Time Interval", lambda s:
                (s.attributes['date']<=time_interval[0] and s.attributes['date']>=time_interval[1]) or
                (s.name in fixed_references and s.attributes['date']>reference_cutoff)
            ),
            ("invalid chars", lambda s: sum([s.seq.count(c) for c in "EFIJKLOPQXYZ"])==0),
            ("Sequence Length", lambda s: len(s.seq)>=900),
            # what's the order of evaluation here I wonder?
            ("Dropped Strains", lambda s: s.id not in fixed_outliers),
            ("Bad geo info", lambda s: s.attributes["country"]!= "?" and s.attributes["region"]!= "?" ),
        ),
        "subsample": flu_subsampling(params, years_back, titer_values),
        "colors": ["region", "country"],
        "color_defs": ["colors.tsv"],
        "lat_longs": ["country", "region"],
        "references": {seg:reference_maps[lineage][seg] for seg in params.segments},
        "regions": regions,
        "time_interval": time_interval,
        "strains": params.strains,
        "titers": titer_values
    }

    ## VACCINES
    if lineage in vaccine_choices:
        config["vaccine_choices"] = vaccine_choices[lineage]
    else:
        print("WARNING. vaccine_choices are undefined for this lineage")

    ## LBI
    # BUGFIX: bare "except:" also swallowed SystemExit/KeyboardInterrupt;
    # only a missing resolution key is expected here.
    try:
        config["LBI_params"] = LBI_params[resolution]
    except KeyError:
        print("WARNING. LBI parameters are undefined for this resolution")

    ## FREQUENCIES
    try:
        config["frequency_params"] = frequency_params[resolution]
    except KeyError:
        print("WARNING. Frequency parameters are undefined for this resolution")

    return config
def flu_subsampling(params, years_back, titer_values):
    """Build the flu subsampling rules (category, priority, threshold).

    ``params.sampling`` selects the scheme: "even" (balance all regions),
    a region name (favor that region), or anything else ("flat").

    Parameters
    ----------
    params : argparse.Namespace
        Reads ``sampling``, ``viruses_per_month`` and ``lineage``.
    years_back : int
        Picks the default viruses-per-month threshold from ``vpm_dict``.
    titer_values : dict or None
        Titer measurements; when None, priorities are random.

    Returns
    -------
    dict
        Keys "category" and "priority" (callables) and "threshold"
        (an int-returning callable factory or a constant lambda).
    """
    if params.sampling == "even":
        type_of_subsampling = "even"
    elif params.sampling in [x[0] for x in regions]:
        type_of_subsampling = "priority"
    else:
        type_of_subsampling = "flat"

    #### DEFINE THE CATEGORY:
    if type_of_subsampling in ["even", "priority"]:
        category = lambda x: (x.attributes['region'],
                              x.attributes['date'].year,
                              x.attributes['date'].month)
    else:
        category = lambda x: (x.attributes['date'].year,
                              x.attributes['date'].month)

    #### DEFINE THE PRIORITY
    # BUGFIX: previously the code branched on the truthiness of
    # HI_titer_count; when titers loaded but the count dict was empty
    # (falsy), neither branch defined priority() and the return below
    # raised NameError. Branch on titer_values instead.
    if titer_values is not None:
        HI_titer_count = TiterCollection.count_strains(titer_values)

        def priority(seq):
            # Prefer titered strains, reward length, penalize ambiguous
            # sites; reference viruses always outrank everything else.
            pr = HI_titer_count.get(seq.attributes['strain'], 0)
            return (pr + len(seq.seq)*0.0001 - 0.01*np.sum([seq.seq.count(nuc) for nuc in 'NRWYMKSHBVD']) +
                    1e6*int(seq.name in reference_viruses[params.lineage]))
    else:
        print("Couldn't load titer information - using random priorities")

        def priority(seq):
            # Random, but reference viruses still outrank everything else.
            return np.random.random() + int(seq.name in reference_viruses[params.lineage])

    ##### DEFINE THE THRESHOLD
    if params.viruses_per_month != 0:
        sampling_threshold = params.viruses_per_month
    else:
        sampling_threshold = vpm_dict[years_back]

    region_threshold = int(np.ceil(1.0*sampling_threshold/len(regions)))
    if type_of_subsampling == "even":
        def threshold(obj):
            """
            a higher order function which returns a fn which has access to
            some summary stats about the sequences (closure)
            """
            sequence_count_total, sequence_count_region = populate_counts(obj)
            def threshold_fn(x):
                # x is the collection key, a tuple of (region, year, month).
                if sequence_count_total[(x[1], x[2])] < sampling_threshold:
                    return sampling_threshold
                region_counts = sorted([sequence_count_region[(r[0], x[1], x[2])] for r in regions])
                if region_counts[0] > region_threshold:
                    return region_threshold
                # Water-filling: raise the per-region cap until the total
                # budget is exhausted.
                left_to_fill = sampling_threshold - len(regions)*region_counts[0]
                thres = region_counts[0]
                for ri, rc in zip(range(len(regions)-1, 0, -1), region_counts[1:]):
                    if left_to_fill - ri*(rc-thres)>0:
                        left_to_fill-=ri*(rc-thres)
                        thres = rc
                    else:
                        # NOTE(review): '/' is float division on Py3 and
                        # integer division on Py2 — confirm intended runtime.
                        thres += left_to_fill/ri
                        break
                return max(1, int(thres))
            return threshold_fn
    elif type_of_subsampling == "priority":
        priority_region = params.sampling
        # Reserve this fraction of the monthly budget for the chosen region.
        fraction = 0.5
        def threshold(obj):
            """
            a higher order function which returns a fn which has access to
            some summary stats about the sequences (closure)
            """
            sequence_count_total, sequence_count_region = populate_counts(obj)
            def threshold_fn(x):
                # x is the collection key, a tuple of (region, year, month).
                if x[0]==priority_region:
                    return int(sampling_threshold*fraction)
                nregions = len(regions)-1
                total_threshold_world = sampling_threshold*(1-fraction)
                region_threshold = int(np.ceil(1.0*total_threshold_world/nregions))
                region_counts = sorted([sequence_count_region[(r[0], x[1], x[2])]
                                        for r in regions if r!=priority_region])
                if region_counts[0]>region_threshold:
                    return region_threshold
                else:
                    left_to_fill = total_threshold_world - nregions*region_counts[0]
                    thres = region_counts[0]
                    for ri, rc in zip(range(nregions-1, 0, -1), region_counts[1:]):
                        if left_to_fill - ri*(rc-thres)>0:
                            left_to_fill-=ri*(rc-thres)
                            thres = rc
                        else:
                            thres += left_to_fill/ri
                            break
                    return max(1,int(thres))
            return threshold_fn
    else: # flat subsampling
        threshold = lambda x: sampling_threshold

    return {
        "category": category,
        "priority": priority,
        "threshold": threshold
    }