Esempio n. 1
0
def parse_metadata(segments, metadata_files, date_format="%Y-%m-%d"):
    """Load metadata for each segment and add numeric date and age fields.

    For every ``(segment, file)`` pair the metadata is read with
    ``read_metadata`` and each record is annotated in place with:

    * ``num_date`` -- ``np.mean`` of the entry ``get_numerical_dates``
      returns for the strain.  If that fails, the last four characters of
      the record's ``date`` field are read as a year and ``0.5`` is added.
      If that fails too, ``num_date`` is set to NaN and the record gets no
      further annotation.
    * ``year`` -- integer part of ``num_date``.
    * ``month`` -- fractional part of ``num_date`` scaled to twelve bins;
      the value is zero-based (0-11).
    * ``age`` -- only when the record already has an ``age`` entry: a value
      ending in ``'y'`` becomes an ``int`` number of years, a value ending
      in ``'m'`` becomes a ``float`` number of years (months / 12), and
      anything else (including an empty string) becomes ``'unknown'``.

    Args:
        segments: Iterable of segment names, paired positionally with
            ``metadata_files``.
        metadata_files: Iterable of metadata file names.
        date_format: Format string forwarded to ``get_numerical_dates`` as
            ``fmt``.

    Returns:
        dict mapping each segment name to its annotated metadata.

    Raises:
        ValueError: If an age ends in ``'y'``/``'m'`` but the remainder is
            not a valid number.
    """
    metadata = {}
    for segment, fname in zip(segments, metadata_files):
        tmp_meta, _ = read_metadata(fname)

        numerical_dates = get_numerical_dates(tmp_meta, fmt=date_format)
        for strain in tmp_meta:
            # NOTE(review): assumes records are mutable mappings so that
            # updates through ``record`` land in ``tmp_meta`` -- confirm
            # against read_metadata.
            record = tmp_meta[strain]
            try:
                record['num_date'] = np.mean(numerical_dates[strain])
            except Exception:
                # Best-effort fallback: treat the trailing four characters
                # of the raw date as a year and place the sample mid-year.
                try:
                    record['num_date'] = int(record['date'][-4:]) + 0.5
                except Exception:
                    record['num_date'] = np.nan
                    continue
            record['year'] = int(record['num_date'])
            record['month'] = int((record['num_date'] % 1) * 12)
            if 'age' in record:
                age_str = record['age']
                # Slice (not index) so an empty string yields '' instead of
                # raising IndexError and falls through to 'unknown'.
                unit = age_str[-1:]
                if unit == 'y':
                    record['age'] = int(age_str[:-1])
                elif unit == 'm':
                    # Months are converted to fractional years.
                    record['age'] = float(age_str[:-1]) / 12.0
                else:
                    record['age'] = 'unknown'

        metadata[segment] = tmp_meta
    return metadata
def parse_metadata(segments, metadata_files):
    """Read metadata per segment and attach numeric date, year and month.

    Each record returned by ``read_metadata`` is updated in place with
    ``num_date`` (``np.mean`` of the strain's entry from
    ``get_numerical_dates`` using the ``'%Y-%m-%d'`` format), ``year`` (its
    integer part) and ``month`` (1-12, derived from the fractional part).

    Args:
        segments: Iterable of segment names, paired positionally with
            ``metadata_files``.
        metadata_files: Iterable of metadata file names.

    Returns:
        dict mapping each segment name to its annotated metadata.
    """
    parsed = {}
    for seg_name, meta_path in zip(segments, metadata_files):
        records, _columns = read_metadata(meta_path)
        date_lookup = get_numerical_dates(records, fmt='%Y-%m-%d')

        for strain in records:
            # NOTE(review): assumes records are mutable mappings so that
            # writes through ``entry`` are visible in ``records`` -- confirm
            # against read_metadata.
            entry = records[strain]
            num_date = np.mean(date_lookup[strain])
            entry['num_date'] = num_date
            entry['year'] = int(num_date)

            # Months are numbered from January == 1 so they can be compared
            # with datetime objects.
            year_fraction = num_date % 1
            entry['month'] = int(year_fraction * 12) + 1

        parsed[seg_name] = records
    return parsed
Esempio n. 3
0
def parse_metadata(segments, metadata_files):
    """Load metadata for each segment and add numeric date and age fields.

    Each record returned by ``read_metadata`` is annotated in place with:

    * ``num_date`` -- ``np.mean`` of the strain's entry from
      ``get_numerical_dates`` (format ``'%Y-%m-%d'``).
    * ``year`` -- integer part of ``num_date``.
    * ``month`` -- fractional part of ``num_date`` scaled to twelve bins;
      the value is zero-based (0-11).
    * ``age`` -- a value ending in ``'y'`` becomes an ``int`` number of
      years, a value ending in ``'m'`` becomes a ``float`` number of years
      (months / 12), and anything else (including an empty string) becomes
      ``'unknown'``.

    Args:
        segments: Iterable of segment names, paired positionally with
            ``metadata_files``.
        metadata_files: Iterable of metadata file names.

    Returns:
        dict mapping each segment name to its annotated metadata.

    Raises:
        KeyError: If a record has no ``age`` entry.
        ValueError: If an age ends in ``'y'``/``'m'`` but the remainder is
            not a valid number.
    """
    metadata = {}
    for segment, fname in zip(segments, metadata_files):
        tmp_meta, _ = read_metadata(fname)

        numerical_dates = get_numerical_dates(tmp_meta, fmt='%Y-%m-%d')
        for strain in tmp_meta:
            # NOTE(review): assumes records are mutable mappings so that
            # updates through ``record`` land in ``tmp_meta`` -- confirm
            # against read_metadata.
            record = tmp_meta[strain]
            record['num_date'] = np.mean(numerical_dates[strain])
            record['year'] = int(record['num_date'])
            record['month'] = int((record['num_date'] % 1) * 12)

            age_str = record['age']
            # Slice (not index) so an empty string yields '' instead of
            # raising IndexError and falls through to 'unknown'.
            unit = age_str[-1:]
            if unit == 'y':
                record['age'] = int(age_str[:-1])
            elif unit == 'm':
                # Months are converted to fractional years.
                record['age'] = float(age_str[:-1]) / 12.0
            else:
                record['age'] = 'unknown'

        metadata[segment] = tmp_meta
    return metadata
Esempio n. 4
0
        "Estimate TMRCA assuming a star topology and a poisson mutation process",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--node-data",
                        required=True,
                        help="JSON with ancestral reconstruction")
    parser.add_argument("--metadata",
                        required=True,
                        help="JSON with ancestral reconstruction")
    parser.add_argument("--tree", required=True, help="newick tree")
    parser.add_argument("--output", required=True, help="figure file")
    args = parser.parse_args()

    T = Phylo.read(args.tree, 'newick')

    metadata, columns = read_metadata(args.metadata)
    dates = get_numerical_dates(metadata, fmt='%Y-%m-%d')
    node_data = read_node_data(args.node_data, args.tree)

    tips = {}
    for n in T.get_terminals():
        if type(dates[n.name]) == list:
            continue
        tips[n.name] = {'numdate': dates[n.name], 'mutations': []}
        path = T.root.get_path(target=n)
        for c in path:
            tips[n.name]['mutations'].extend([
                x for x in node_data['nodes'][c.name]['muts']
                if not (x[0] in ['N', '-'] or x[-1] in ['N', '-'])
            ])

    tmrca = np.linspace(2019.7,
Esempio n. 5
0
        "--reference-strains",
        help=
        "text file containing list of reference strains that should be included from the original strains even if they were sampled prior to the minimum date determined by the requested number of years before the given timepoint"
    )
    args = parser.parse_args()

    # Convert date string to a datetime instance.
    timepoint = pd.to_datetime(args.timepoint)
    numeric_timepoint = np.around(numeric_date(timepoint), 2)

    # Load metadata with strain names and dates.
    metadata, columns = read_metadata(args.metadata)

    # Convert string dates with potential ambiguity (e.g., 2010-05-XX) into
    # floating point dates.
    dates = get_numerical_dates(metadata, fmt="%Y-%m-%d")

    # Setup reference strains.
    if args.reference_strains:
        reference_strains = read_strain_list(args.reference_strains)
    else:
        reference_strains = []

    # If a given number of years back has been requested, determine what the
    # earliest date to accept for strains is.
    if args.years_back is not None:
        earliest_timepoint = timepoint - pd.DateOffset(years=args.years_back)
        numeric_earliest_timepoint = np.around(
            numeric_date(earliest_timepoint), 2)

        # If reference strains are provided, calculate the earliest date to