def write_messages_to_tsv(files, bucket_name, metadata_file=None):
    """
    Consume the subscription and write the results to a TSV manifest.
    Args:
        files(list): a list of object file dicts, each of the form
            {
                "url": "test_url",
                "md5": "test_md5",
                "size": 1
            }
        bucket_name(str): bucket for uploading the manifest to
        metadata_file(str): metadata file for merging
    """
    metadata_info = {}
    # Default filenames without merging
    fields = ["url", "size", "md5"]

    # merge extra metadata info from file
    if metadata_file:
        with open(metadata_file, "rt") as csvfile:
            csvReader = csv.DictReader(csvfile, delimiter="\t")
            # Build a map with url as the key
            for row in csvReader:
                if "url" in row:
                    metadata_info[row["url"]] = {
                        k: v
                        for k, v in row.items() if k != "url"
                    }

        # do merging if possible, and update fields
        need_merge = False
        first_row_need_merge = None
        for row_num, fi in enumerate(files):
            if fi["url"] in metadata_info:
                need_merge = True
                if first_row_need_merge is None:
                    first_row_need_merge = row_num
                for k, v in metadata_info[fi["url"]].items():
                    fi[k] = v
        if files and need_merge:
            # add new fields
            for k in files[first_row_need_merge].keys():
                if k not in ["url", "size", "md5"]:
                    fields.append(k)

    if len(files) > 0:
        # parse the url
        parts = urlparse(files[0]["url"])

        # generate unique manifest output
        now = datetime.now()
        current_time = now.strftime("%m_%d_%y_%H:%M:%S")
        filename = "manifest_{}_{}.tsv".format(parts.netloc, current_time)

        # write list of object metadata to a file
        utils.write_tsv(filename, files, fields)
        # Upload the file to google bucket
        utils.upload_file(bucket_name, filename, filename)

    logging.info("DONE!!!")
Example #2
def main():

    folder = "./data/transomcs/"
    file = folder + "TransOMCS_full.txt"
    data = read_csv(file, delimiter="\t")

    confidences = {}
    for d in data:
        key = tuple_key(d)
        confidences[key] = float(d[3])

    human_eval_file = folder + "human_evaluation_tuples.tsv"
    tuples = read_csv(human_eval_file, delimiter="\t", skip_header=True)

    updated_t = [{
        "head_event": t[0],
        "relation": t[1],
        "tail_event": t[2]
    } for t in tuples if confidences[tuple_key(t)] >= 0.5]
    dropped = [{
        "head_event": t[0],
        "relation": t[1],
        "tail_event": t[2]
    } for t in tuples if confidences[tuple_key(t)] < 0.5]

    output_file = folder + "human_evaluation_tuples_v2.tsv"
    write_tsv(output_file, updated_t)

    output_file = folder + "dropped_human_evaluation_tuples_v2.tsv"
    write_tsv(output_file, dropped)
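Both `read_csv` and `tuple_key` come from the surrounding project and are not shown here. Given that each TransOMCS row is (head, relation, tail, confidence), a plausible sketch of `tuple_key`, offered as an assumption for illustration rather than the project's actual helper, is:

def tuple_key(row):
    # Hypothetical: key a row by its first three fields (head, relation, tail),
    # so full TransOMCS rows and human-evaluation tuples map to the same key.
    return (row[0], row[1], row[2])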
Example #3
def parse():
    r = requests.get(URL)
    if not r.ok:
        print(f"Failed to fetch {URL}", file=sys.stderr)
        r.close()
        exit(1)

    db = json.loads(r.text)
    r.close()

    # Convert to ready-made TSVs
    regions = defaultdict(list)
    for row in db:
        elt = [
            int(row[X[c]]) if i > 0 else row[X[c]].split()[0]
            for i, c in enumerate(cols)
        ]
        regions[row[X["region"]]].append(elt)
    regions = dict(regions)

    # Sum all regions to obtain Italian data
    dates = defaultdict(lambda: np.zeros(len(cols) - 1))
    for data in regions.values():
        for datum in data:
            dates[datum[0]] += np.array(datum[1:])

    regions["Italy"] = []
    for date, counts in dates.items():
        regions["Italy"].append([date] + [int(c) for c in counts])

    for region, data in regions.items():
        write_tsv(f"{LOC}/{region}.tsv", cols, data, "italy")
Example #4
def parse():
    r  = requests.get(URL)
    if not r.ok:
        print(f"Failed to fetch {URL}", file=sys.stderr)
        r.close()
        exit(1)

    db = json.loads(r.text)
    r.close()

    # Convert to ready-made TSVs
    regions = defaultdict(list)
    for row in db:
        date = str(row["date"])
        date = f"{date[0:4]}-{date[4:6]}-{date[6:8]}"
        elt  = [ date, row["positive"], row["death"], None, None, None ]
        regions[acronyms[row["state"]]].append(elt)
    regions = dict(regions)

    for region, data in regions.items():
        write_tsv(f"{LOC}/{region}.tsv", cols, data, "unitedstates")
Example #5
def parse():
    r  = requests.get(URL)
    if not r.ok:
        print(f"Failed to fetch {URL}", file=sys.stderr)
        r.close()
        exit(1)

    regions = defaultdict(list)
    fd  = io.StringIO(r.text)
    rdr = csv.reader(fd)
    hdr = next(rdr)

    for row in rdr:
        date   = row[0]
        canton = cantonal_codes[row[1]]
        regions[canton].append([date, to_int(row[2]), to_int(row[5]), to_int(row[6]), None, to_int(row[7])])

    for region, data in regions.items():
        if region != "Liechtenstein":
            write_tsv(f"{LOC}/{region}.tsv", cols, data, "switzerland")
        else:
            write_tsv(f"{LOC2}/{region}.tsv", cols, data, "switzerland")
def calculate_all_outliers(samples,
                           thresholds,
                           filtered_genelist_path,
                           outliers_path,
                           verbose=False):
    """For each sample calculate up and down outlier genes based on the thresholds.
    Then, apply expression & variance filters to mark 'noise' genes, and save as result file."""
    print_v = print if verbose else lambda *a, **k: None
    print_v("Calculating outliers for all samples.")

    outliers = samples.apply(single_sample_outliers, args=[thresholds], axis=0)

    # Apply the expression & variance filters that we calculated in step 1 to mark filtered
    # genes as dropped due to Expression or Variance, or as Retained.
    # gene_filter_status is a pd.Series with values eg ["E"]
    gene_filter_status = utils.read_feather(
        filtered_genelist_path)["status"].apply(list)
    outliers = outliers.add(gene_filter_status, axis="index")
    outliers = outliers.applymap("".join)  # Concatenate all values to string

    print_v("Writing outlier results to {}".format(outliers_path))
    utils.write_tsv(outliers, outliers_path)
    return outliers
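Here `utils.write_tsv(outliers, outliers_path)` takes a pandas DataFrame rather than a list of rows. A minimal, assumed equivalent on top of pandas would be:

import pandas as pd

def write_tsv(df, path):
    # Hypothetical sketch: persist the DataFrame as a tab-separated file,
    # keeping the index (the outlier table above is indexed by gene).
    df.to_csv(path, sep="\t")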
Example #7
def parse():
    cases = retrieve_case_data()
    cases = flatten(cases)

    write_tsv(f"{LOC}/World.tsv", cols, cases, "world")
Example #8
from utils import get_paragraph_words
from utils import make_transposition_pair_dataset
from utils import write_tsv

if __name__ == '__main__':
    paragraphs = get_paragraph_words(500, 20, 60, 3, tags=[[0, 1, 2]])
    train_data, validation_data = make_transposition_pair_dataset(
        paragraphs, 128)

    write_tsv(train_data, 'train.tsv')
    write_tsv(validation_data, 'dev.tsv')
Example #9
match_ids = get_match_ids(fn=match_ids_fn)

data_fn = '../output/match_player_data.tsv'
data = pd.read_csv(data_fn, sep='\t')

dupes = data[data['match_id_slot'].duplicated()]
print(dupes)

# Inspect one specific match (bare expression; only shows output in an interactive session)
data[data['match_id'] == 5536512014]
print(len(data))

print(len(data.drop_duplicates()))

import utils
match_kills_out_fn = '../output/match_player_data.tsv'
utils.write_tsv(data=data.drop_duplicates(), fn=match_kills_out_fn)

data_ids = set(data['match_id'])

missing_mIDs = set(match_ids) - data_ids
print(len(missing_mIDs))

match_ids.index(list(missing_mIDs)[0])
match_ids.index(list(missing_mIDs)[1])

# import csv
# # cw = csv.writer(open('../input/missing_mIDs.csv', 'w'))
# # cw.writerow(list(missing_mIDs))
# with open('../input/missing_mIDs.txt', 'w') as outfile:
#     # write(list(missing_mIDs))
#     writer = csv.writer(outfile)
    # Update df with first blood killers
    rm_no_kills.loc[rm_no_kills['match.id.slot'].isin(fb_kill_mID_slots),
                    'first.blood'] = 1

    return rm_no_kills


if __name__ == '__main__':
    json_in_fn = '../input/match_data.json'

    matches = utils.load_match_json(json_in_fn=json_in_fn)

    match_kills = get_player_data(matches=matches)

    match_kills_out_fn = '../output/match_player_data_v2.tsv'
    utils.write_tsv(data=match_kills, fn=match_kills_out_fn)

    print("Player level data for each match:")
    print(f" - Saved {len(matches)} matches to output folder")

# Junk:
# print(matches[0]['start_time'])
# matches = json.loads(json_file)

# print(matches[0]['match_id'])
# print(matches[1]['match_id'])

# matches[0].keys()

# type(matches)
# len(matches)