Example #1
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    flags.mark_flag_as_required('prediction_file')

    sources, predictions, _ = score_lib.read_data(FLAGS.prediction_file,
                                                  FLAGS.case_insensitive)
    ref_filepaths = [
        get_data_filepath('turkcorpus', 'valid', 'simple.turk', i)
        for i in range(8)
    ]
    target_lists = [read_lines(ref_filepath) for ref_filepath in ref_filepaths]
    logging.info(f'Read file: {FLAGS.prediction_file}')
    turk_scores = get_all_scores(orig_sents=sources,
                                 sys_sents=predictions,
                                 refs_sents=target_lists)
    logging.info("[turk] {}".format(turk_scores))
    ref_filepaths = [
        get_data_filepath('asset', 'valid', 'simp', i) for i in range(10)
    ]
    target_lists = [read_lines(ref_filepath) for ref_filepath in ref_filepaths]
    asset_scores = get_all_scores(orig_sents=sources,
                                  sys_sents=predictions,
                                  refs_sents=target_lists)
    logging.info("[asset] {}".format(asset_scores))
def test_model(config_file, model, device):
    """
    :param config_file: (string) path to a '.yaml' configuration file.
    :param model: (MalConv).
    :param device: the device that was used for the model.
    """
    # load configurations
    try:
        with open(config_file, 'r') as cf:
            conf = yaml.safe_load(cf)
    except (OSError, yaml.YAMLError):
        print('Error with test configuration yaml')
        sys.exit()

    classes = utils.read_lines(conf[LABELS])
    i2l = {i: l for i, l in enumerate(classes)}
    l2i = {l: i for i, l in i2l.items()}

    # create loader
    files = utils.read_lines(conf[FILES_LS_PATH])  # list files to predict on
    test_loader = DataLoader(ExeDatasetNoLabels(files, l2i, conf[NUM_BYTES]),
                             batch_size=1,
                             shuffle=False,
                             num_workers=conf[WORKERS])

    # predict
    with open(conf[TARGET_FILE], 'w') as f:
        for x in test_loader:  # todo check this
            if device is not None:
                x = x.to(device)
            pred = model(x)
            pred_label = torch.max(pred, 1)[1].item()
            f.write('{}\n'.format(pred_label))
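
The docstring above describes the expected arguments but not how the function is driven. Below is a hedged call-site sketch; the config path, checkpoint name, and the availability of the MalConv class are placeholders, not part of the original example:

import torch

# Hypothetical call site for test_model; paths and checkpoint name are
# placeholders, and MalConv is assumed to be importable from the project.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MalConv()
model.load_state_dict(torch.load('checkpoints/malconv.pt', map_location=device))
model.to(device)
model.eval()
test_model('configs/test.yaml', model, device)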
Example #3
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "segmented"))
    filenames = [n for n in filenames if n.endswith(".txt")]
    filenames.sort()

    utils.mkdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))

    for filename in pyprind.prog_bar(filenames):
        path_seg = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "segmented", filename)
        path_raw = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "raw", filename)
        path_dst = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "preprocessed",
                                filename.replace(".txt", ".edus"))
        # Input
        edus = utils.read_lines(path_seg, process=lambda line: line)
        edus = remove_empty_lines(filename, edus)
        raw_lines = utils.read_lines(path_raw, process=lambda line: line)
        raw_lines = remove_empty_lines(filename, raw_lines)
        assert count_chars(edus) == count_chars(raw_lines)
        # Processing
        edus = convert_edus(edus, raw_lines)
        assert count_chars(edus) == count_chars(raw_lines)
        # Output
        utils.write_lines(path_dst, edus)
Example #4
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".edus")]
    filenames.sort()

    n_skipped = 0
    for file_i, filename in enumerate(filenames):
        path_s = os.path.join(
            config.getpath("data"), "ptbwsj_wo_rstdt", "tmp.preprocessing",
            filename.replace(".edus", ".sentence.boundaries"))
        path_p = os.path.join(
            config.getpath("data"), "ptbwsj_wo_rstdt", "tmp.preprocessing",
            filename.replace(".edus", ".paragraph.boundaries"))

        if not os.path.exists(path_s):
            print("Skipped %s because %s doesn't exist." % (filename, path_s))
            n_skipped += 1
            continue
        if not os.path.exists(path_p):
            print("Skipped %s because %s doesn't exist." % (filename, path_p))
            n_skipped += 1
            continue

        sbnds = utils.read_lines(
            path_s, process=lambda l: tuple([int(x) for x in l.split()]))
        pbnds = utils.read_lines(
            path_p, process=lambda l: tuple([int(x) for x in l.split()]))

        sbnds_proj, n_edus = project_pbnds_to_sbnds(sbnds=sbnds, pbnds=pbnds)
        if sbnds != sbnds_proj:
            print("Projected paragraph boundaries into the sentence boundaries (+%d): %s" % \
                    (len(sbnds_proj) - len(sbnds), path_s))

        test_boundaries(sbnds_proj, n_edus)

        pbnds = replace_subtrees_with_ids(sbnds=sbnds_proj, pbnds=pbnds)

        write_boundaries(
            sbnds,
            os.path.join(
                config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed",
                filename.replace(".edus", ".sentence.noproj.boundaries")))
        write_boundaries(
            sbnds_proj,
            os.path.join(
                config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed",
                filename.replace(".edus", ".sentence.proj.boundaries")))
        write_boundaries(
            pbnds,
            os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                         "preprocessed",
                         filename.replace(".edus", ".paragraph.boundaries")))

    print("Skipped %d files." % n_skipped)
Example #5
def main(args):
    path = args.path

    filenames = os.listdir(path)
    filenames = [n for n in filenames if n.endswith(".edus.arcs")]
    filenames.sort()

    for filename in pyprind.prog_bar(filenames):
        # EDUs (sub-arcs, tokens, POS tags)
        edus_arcs = utils.read_lines(os.path.join(path, filename),
                                     process=lambda line: line.split())
        edus_tokens = utils.read_lines(os.path.join(
            path, filename.replace(".edus.arcs", ".edus.tokens")),
                                       process=lambda line: line.split())
        edus_postags = utils.read_lines(os.path.join(
            path, filename.replace(".edus.arcs", ".edus.postags")),
                                        process=lambda line: line.split())

        heads = []
        for tokens, postags, arcs in zip(edus_tokens, edus_postags, edus_arcs):
            arcs = treetk.hyphens2arcs(arcs)

            # Check: Arcs should be arranged in ascending order wrt dependent
            prev_d = -1
            for h, d, l in arcs:
                assert d > prev_d
                prev_d = d

            head_idx = None

            # If its head is the root, it is the head of the EDU
            for idx, (h, d, l) in enumerate(arcs):
                if h == 0:
                    assert l == "ROOT"  # TODO
                    head_idx = idx
                    break

            # If its head is outside the span, it is the head of the EDU
            if head_idx is None:
                span_min = arcs[0][1]
                span_max = arcs[-1][1]
                for idx, (h, d, l) in enumerate(arcs):
                    if h < span_min or h > span_max:
                        head_idx = idx
                        break

            # Head token, POS tag, and dependency relation
            head_token = tokens[head_idx]
            head_postag = postags[head_idx]
            head_deprel = arcs[head_idx][2]
            heads.append((head_token, head_postag, head_deprel))

        # Write
        with open(
                os.path.join(path, filename.replace(".edus.arcs",
                                                    ".edus.heads")), "w") as f:
            for token, postag, deprel in heads:
                f.write("%s %s %s\n" % (token, postag, deprel))
def process(path, check_token, check_char, check_boundary):

    filenames = os.listdir(path)
    filenames = [n for n in filenames if n.endswith(".edus.tokens")]
    filenames.sort()

    confliction = False
    for filename in filenames:

        # Gold EDUs
        lines_e = utils.read_lines(os.path.join(path, filename))
        text_e = " ".join(lines_e)  # str
        edus = [l.split() for l in lines_e]  # List[List[str]]

        # Paragraphs
        lines_d = utils.read_lines(
            os.path.join(path, filename.replace(".edus.tokens",
                                                ".doc.tokens")))
        text_d = " ".join(lines_d)  # str
        paras = []  # List[List[str]]
        para = [lines_d[0].split()]
        for i in range(1, len(lines_d)):
            line = lines_d[i].split()
            if len(line) == 0 and len(para) == 0:
                continue
            elif len(line) == 0 and len(para) != 0:
                paras.append(para)
                para = []
            else:
                para.append(line)
        if len(para) != 0:
            paras.append(para)

        # Test
        if check_token and not test_tokenlevel_confliction(text_e, text_d):
            print("Found token-level confliction: %s" %
                  os.path.join(path, filename))
            confliction = True

        if check_char and not test_charlevel_confliction(text_e, text_d):
            print("Found char-level confliction: %s" %
                  os.path.join(path, filename))
            confliction = True

        if check_boundary and not test_boundary_confliction(edus, paras):
            print("Found paragraph-boundary confliction: %s" %
                  os.path.join(path, filename))
            confliction = True

    if not confliction:
        print("Found NO confliction: OK")
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")]
    filenames = [
        n.replace(".paragraph.boundaries", ".edus") for n in filenames
    ]
    filenames.sort()

    with open(
            os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                         "tmp.preprocessing", "filelist.corenlp2.txt"),
            "w") as ff:
        for filename in filenames:
            # Path
            path_edus = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                     "tmp.preprocessing",
                                     filename + ".tokenized")
            path_sbnds = os.path.join(
                config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed",
                filename.replace(".edus", ".sentence.noproj.boundaries"))
            path_sents = os.path.join(config.getpath("data"),
                                      "ptbwsj_wo_rstdt", "tmp.preprocessing",
                                      filename.replace(".edus", ".sentences"))

            # Read
            edus = utils.read_lines(
                path_edus,
                process=lambda line: line.split())  # list of list of str
            sbnds = utils.read_lines(
                path_sbnds,
                process=lambda line: tuple(
                    int(x) for x in line.split()))  # list of (int, int)

            # Create sentences based on the sentence boundaries
            sentences = []
            for begin_i, end_i in sbnds:
                sentence = edus[begin_i:end_i + 1]  # list of list of str
                sentence = utils.flatten_lists(sentence)  # list of str
                sentences.append(sentence)

            # Write
            with open(path_sents, "w") as fs:
                for sentence in sentences:
                    fs.write("%s\n" % " ".join(sentence))
            ff.write("%s\n" % path_sents)
    def __build_feats(datasets_info):
        """ Builds the feature and label vectors from the specified datasets

		Arguments:
		----------
			datasets_info:
				type: list
				info: list of dictionaries containing:
					- dataset_name (string)
					- dataset_label (string)

		Returns:
		----------
			samples:
				type: list
				info: contains all the sentences

			labels:
				type: list
				info: contains all the sentences labels
		"""

        samples = []
        labels = []

        for info in datasets_info:
            name = info['dataset_name']
            label = info['dataset_label']

            sentences = read_lines(file_name=name, file_type='dataset')

            samples.extend(sentences)
            labels.extend([label] * len(sentences))

        return samples, labels
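
The docstring lists the per-dataset dictionaries this helper expects. A small illustration of that structure follows, with made-up dataset names and labels; it is written as a plain call for readability, although in the original code this is a private helper inside a class:

# Hypothetical input for __build_feats; dataset names and labels are placeholders.
datasets_info = [
    {'dataset_name': 'positive_tweets', 'dataset_label': 'positive'},
    {'dataset_name': 'negative_tweets', 'dataset_label': 'negative'},
]
samples, labels = __build_feats(datasets_info)
# samples holds every sentence from both files; labels holds 'positive' or
# 'negative' in the same order, one label per sentence.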
Example #9
def clean(dir_):
    """Filters the dir_ directory (splits into dir_cleaned and dir_junk)."""
    dir_cleaned = join(DATA_DIR, dir_ + '_cleaned_final')
    _make_labeled_dir_structure(dir_cleaned)
    dir_junk = join(DATA_DIR, dir_ + '_junk_final')
    _make_labeled_dir_structure(dir_junk)

    black_list = read_lines(IMAGES_BLACKLIST_FILE,
                            line_func=lambda l: l.rstrip())

    for class_dir in CLASSES:

        class_dir_abs = join(DATA_DIR, dir_, class_dir)
        for file_name in listdir(class_dir_abs):

            if not file_name.endswith('.jpg'):
                continue

            src_path = join(class_dir_abs, file_name)

            if _is_clean_image(black_list, src_path):
                dest_path = join(dir_cleaned, class_dir, file_name)
            else:
                dest_path = join(dir_junk, class_dir, file_name)

            copyfile(src_path, dest_path)
Example #10
def main():
    # '#' marks an asteroid, '.' empty space; build a 0/1 grid from the input
    m = {'#': 1, '.': 0}
    l = lmap(lambda x: [m[i] for i in x[:-1]], read_lines())
    ast = []

    # Collect the (x, y) coordinates of every asteroid
    for i in range(len(l)):
        for j in range(len(l[0])):
            if l[i][j]:
                ast.append((j, i))

    # Part 1: each distinct angle corresponds to one visible asteroid,
    # so the best station is the asteroid with the most distinct angles
    s = []
    for i in ast:
        angles = set()
        for j in ast:
            if i != j:
                k = compute_k(i, j)
                angles.add(k)
        s.append(len(angles))

    i = np.argmax(s)
    print(s[i])

    # Part 2: group the remaining asteroids by their angle from the chosen station
    best = ast[i]
    dd = collections.defaultdict(list)
    for i in ast:
        if i != best:
            k = compute_k(i, best)
            dd[k].append(i)

    # Sort into firing order and report the value for the 200th asteroid vaporized
    d = dict(dd)
    s = sorted([j for i in d.values() for j in compute_angles_group(i, best)])
    print(s[199][1])
Example #11
def csv_to_npz(csv_file_name):
    print("reading file '{}'".format(csv_file_name), flush=True)

    categories = read_lines("/storage/kaggle/quickdraw/categories.txt")

    df = pd.read_csv(
        csv_file_name,
        index_col="key_id",
        converters={
            "word": lambda word: categories.index(word),
            "drawing": lambda drawing: np.array(eval(drawing))
        })

    df = df.rename(columns={"word": "category"})

    key_id = np.array(df.index.values, dtype=np.int64)
    drawing = np.array(df.drawing.values, dtype=object)
    category = np.array(df.category.values, dtype=np.int16)
    recognized = np.array(df.recognized.values, dtype=bool)
    countrycode = np.array(df.countrycode.values, dtype=object)

    npz_file_name = csv_file_name[:-4] + ".npz"
    print("writing file '{}'".format(npz_file_name), flush=True)
    np.savez_compressed(
        npz_file_name,
        key_id=key_id,
        drawing=drawing,
        category=category,
        recognized=recognized,
        countrycode=countrycode)

    return None
Example #12
def main():
    seats = [Seat(line) for line in utils.read_lines()]

    max_uid = 0
    for seat in seats:
        max_uid = max(max_uid, seat.uid)
    print("Part1: {}".format(max_uid))

    max_row_id = Seat('BBBBBBBRRR').uid

    seats_exist = []
    for i in range(max_row_id):
        found = False
        for seat in seats:
            # print(seat.uid)
            if seat.uid == i:
                found = True
                break

        seats_exist.append(found)

    # print(seats_exist)

    for i in range(8, max_row_id - 1):
        if seats_exist[i-1] and seats_exist[i + 1] and not seats_exist[i]:
            print('Part2: {}'.format(i))
            break
Example #13
def part_two():
    def is_valid(line):
        pos_1, pos_2, char, password = parse_policy(line)
        return (password[int(pos_1) - 1] == char) != (password[int(pos_2) - 1]
                                                      == char)

    return sum(is_valid(line) for line in read_lines(day=2))
Example #14
def eval_on(config_file, model, device):
    """
    :param config_file: (string) path to a '.yaml' configuration file.
    :param model: (MalConv).
    :param device: the device that was used for the model.
    """
    try:
        with open(config_file, 'r') as cf:
            conf = yaml.safe_load(cf)
    except (OSError, yaml.YAMLError):
        print('Error with dev configuration yaml')
        sys.exit()

    classes = utils.read_lines(conf[LABELS])
    i2l = {i: l for i, l in enumerate(classes)}
    l2i = {l: i for i, l in i2l.items()}

    path2label = utils.create_path2label_dict(conf[MAIN_DIR], conf[L2DIR])
    size_dev = len(path2label)
    keys = list(path2label.keys())
    np.random.shuffle(keys)
    dev_set = [(key, path2label[key]) for key in keys]
    fps_dev, y_dev = utils.split_to_files_and_labels(dev_set)

    validloader = DataLoader(ExeDataset(fps_dev, y_dev, l2i, conf[NUM_BYTES]),
                             batch_size=conf[BATCH],
                             shuffle=False,
                             num_workers=conf[WORKERS])
    acc, t = validate_dev_set(validloader, model, device, size_dev,
                              conf[CONF_MAT])
    print('time-dev: {:.2f} dev-acc: {:.4f}'.format(t, acc))
Example #15
def main(args):
    config = utils.Config()

    utils.mkdir(os.path.join(config.getpath("data"), "rstdt-vocab"))

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "rstdt", "renamed"))
    filenames = [n for n in filenames if n.endswith(".edus")]
    filenames.sort()

    with open(
            os.path.join(config.getpath("data"), "rstdt", "tmp.preprocessing",
                         "concat.edus.heads.deprel"), "w") as f:
        for filename in filenames:
            deprels = utils.read_lines(os.path.join(config.getpath("data"),
                                                    "rstdt", "renamed",
                                                    filename + ".heads"),
                                       process=lambda line: line.split()[-1])
            for deprel in deprels:
                f.write("%s\n" % deprel)

    if args.with_root:
        special_words = ["<root>"]
    else:
        special_words = []
    textpreprocessor.create_vocabulary.run(
        os.path.join(config.getpath("data"), "rstdt", "tmp.preprocessing",
                     "concat.edus.heads.deprel"),
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "deprels.vocab.txt"),
        prune_at=10000000,
        min_count=-1,
        special_words=special_words,
        with_unk=True)
Example #16
def extract_abstract(path):
    lines = utils.read_lines(path, process=lambda line: line)
    abst_lines = []
    in_abstract = False
    for line in lines:
        if line.lower().startswith("abstract"):
            in_abstract = True
            # Remove the beginning token (e.g., "Abstract", "ABSTRACT")
            tokens = line.split()
            tokens = tokens[1:]
            if len(tokens) > 0:
                line = " ".join(tokens)
                abst_lines.append(line)
        elif "introduction" in line.lower() or line.startswith("1"):
            in_abstract = False
            break
        elif in_abstract:
            abst_lines.append(line)
    while True:
        length = len(abst_lines)
        for i in range(len(abst_lines)):
            if abst_lines[i].endswith("-") and len(
                    abst_lines[i]) != 1 and abst_lines[i][-2] != "-":
                if i + 1 < len(abst_lines):
                    line = abst_lines[i]
                    line = line[:-1]
                    abst_lines[i + 1] = line + abst_lines[i + 1]
                    abst_lines.pop(i)
                    break
        if length == len(abst_lines):
            break
    abst_text = " ".join(abst_lines)
    return abst_text
Example #17
def main(args):
    path = args.path

    filenames = os.listdir(path)
    filenames = [n for n in filenames if n.endswith(".edus.arcs")]
    filenames.sort()

    for filename in pyprind.prog_bar(filenames):
        edus_arcs = utils.read_lines(os.path.join(path, filename),
                                     process=lambda line: line.split())

        edus_deprels = []
        for arcs in edus_arcs:
            arcs = treetk.hyphens2arcs(arcs)
            deprels = [l for h, d, l in arcs]
            edus_deprels.append(deprels)

        # Write
        with open(
                os.path.join(path,
                             filename.replace(".edus.arcs", ".edus.deprels")),
                "w") as f:
            for deprels in edus_deprels:
                deprels = " ".join(deprels)
                f.write("%s\n" % deprels)
Example #18
def test_parse_instructions():
    program = utils.read_lines("data/08_tests.data")
    compiler = Compiler(program)
    instruction = "acc -99"
    expected = {"type": "acc", "argument": -99}
    result = compiler.parse_instruction(instruction)
    assert expected == result
    def decode(self, output=None, remove_unk=False, raw_output=False, max_test_size=None, **kwargs):
        utils.log('starting decoding')

        # empty `test` means that we read from standard input, which is not possible with multiple encoders
        # assert len(self.src_ext) == 1 or self.filenames.test
        # check that there is the right number of files for decoding
        # assert not self.filenames.test or len(self.filenames.test) == len(self.src_ext)

        output_file = None
        try:
            output_file = sys.stdout if output is None else open(output, 'w')
            paths = self.filenames.test or [None]
            lines = utils.read_lines(paths, binary=self.binary)

            if max_test_size:
                lines = itertools.islice(lines, max_test_size)

            if not self.filenames.test:  # interactive mode
                batch_size = 1
            else:
                batch_size = self.batch_size
                lines = list(lines)

            hypothesis_iter = self.decode_batch(lines, batch_size, remove_unk=remove_unk)

            for hypothesis, raw in hypothesis_iter:
                if raw_output:
                    hypothesis = raw

                output_file.write(hypothesis + '\n')
                output_file.flush()
        finally:
            # avoid closing sys.stdout when decoding to standard output
            if output_file is not None and output_file is not sys.stdout:
                output_file.close()
	def __init__(self, lang):

		""" Creates a text tokenizer object

		Arguments:
		----------
			lang:
				type: string
				info: language to perform the tokenizer process
		"""

		if lang not in languages:
			exit('Invalid language')

		self.lemmatizer = SnowballStemmer(lang)
		self.tokenizer = TweetTokenizer(
			preserve_case = False,
			reduce_len = True,
			strip_handles = True
		)

		self.stopwords = set(read_lines(
			file_name = lang + '.txt',
			file_type = 'stopwords'
		))
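
A brief usage sketch for this constructor; the enclosing class name used below is an assumption, and the attribute access mirrors the tokenizer, lemmatizer, and stopwords fields set above:

# Hypothetical usage; TextTokenizer is a placeholder for the real class name.
tk = TextTokenizer('english')
tokens = tk.tokenizer.tokenize("Loving these examples!! @user")
stems = [tk.lemmatizer.stem(t) for t in tokens if t not in tk.stopwords]
print(stems)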
Example #21
def load_train_info():
    train_info = read_lines(TRAIN_INFO_FILE)[1:]
    parsed_train_info = {}
    for l in train_info:
        split = l.split(',')
        parsed_train_info[split[0]] = split[1]
    return parsed_train_info
Example #22
def prepare_shards():
    num_shards = 50

    if os.path.isdir("/storage/kaggle/quickdraw/train_simplified_shards"):
        shutil.rmtree("/storage/kaggle/quickdraw/train_simplified_shards")
    os.makedirs("/storage/kaggle/quickdraw/train_simplified_shards")

    categories = read_lines("/storage/kaggle/quickdraw/categories.txt")

    for category in categories:
        csv_file_name = "/storage/kaggle/quickdraw/train_simplified/{}.csv".format(category)

        print("processing file '{}'".format(csv_file_name), flush=True)

        df = pd.read_csv(csv_file_name, index_col="key_id")

        shard_size = math.ceil(len(df) / num_shards)
        indexes = df.index.values
        np.random.shuffle(indexes)

        for s in range(num_shards):
            start = s * shard_size
            end = min(start + shard_size, len(df))
            shard_df = df[df.index.isin(indexes[start:end])]
            shard_file_name = "/storage/kaggle/quickdraw/train_simplified_shards/shard-{}.csv".format(s)
            write_csv_header = not os.path.isfile(shard_file_name)
            with open(shard_file_name, "a") as shard_file:
                shard_df.to_csv(shard_file, header=write_csv_header)
Example #23
def main():
    path_len = 100
    nb_feature = 16
    weibo_file = os.path.join(project_folder, 'dataset', 'weibo', 'weibo.txt')
    lines = utils.read_lines(weibo_file)
    x = []
    y = []
    i = 1
    for line in lines:
        print(i)
        i += 1
        line = line.replace('\t', ' ')
        sp = line.split(' ')
        eid = sp[0].split(':')[1]
        label = sp[1].split(':')[1]
        y.append(int(label))
        f = []
        json_file = os.path.join(project_folder, 'dataset', 'weibo', 'Weibo',
                                 eid + '.json')
        text_content = utils.read(json_file)
        json_content = json.loads(text_content)
        for post in json_content[0:path_len]:
            f.append(get_feature(post))
        if len(f) < path_len:
            for j in range(path_len - len(f)):
                f.append([0 for j in range(nb_feature)])
        x.append(f)

    y = numpy.array(y)
    x = numpy.array(x)

    print(x.shape, y.shape)
    numpy.save(os.path.join(project_folder, 'feature', 'weibo', 'x.npy'), x)
    numpy.save(os.path.join(project_folder, 'feature', 'weibo', 'y.npy'), y)
def get_active(dimensions):
    cubes = set()
    for y, row in enumerate(read_lines(day=17)):
        for x, cube in enumerate(row):
            if cube == "#":
                cubes.add(tuple([y, x] + [0 for _ in range(dimensions - 2)]))
    return cubes
Example #25
def parse_raw_file(file):
    '''
    Parse the raw file into question-answer pairs.
    Raw file format:
    Q: hello
    Q: hi
    A: hello
    A: hi

    Q: ...
    A: ...
    yield: [q, a]
    '''
    qa = {'q': set(), 'a': set()}
    for line in read_lines(file, yield_null=True):
        if not line:
            if len(qa['q']) > 0 and len(qa['a']) > 0:
                for q in qa['q']:
                    for a in qa['a']:
                        yield [q, a]
            qa = {'q': set(), 'a': set()}
        elif line[0] == 'Q':
            if line[3:]:
                qa['q'].add(line[3:])
        elif line[0] == 'A':
            if line[3:]:
                qa['a'].add(line[3:])
    if len(qa['q']) > 0 and len(qa['a']) > 0:
        for q in qa['q']:
            for a in qa['a']:
                yield [q, a]
    qa = {'q': set(), 'a': set()}
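
Given the file format described in the docstring, a quick illustration of what the generator yields; the file name is a placeholder and the contents are the docstring's own sample:

# Hypothetical usage of parse_raw_file on a file containing:
#   Q: hello
#   Q: hi
#   A: hello
#   A: hi
# Every Q line in a block is paired with every A line of the same block.
for q, a in parse_raw_file('dialogue.txt'):
    print(q, '->', a)
# Yields the four combinations of ('hello', 'hi') x ('hello', 'hi'); because the
# questions and answers are stored in sets, the order may vary.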
Example #26
def answer_analogy_questions(analogies_path, embeddings, words2ids, top_k):
    all_questions_init = read_lines(analogies_path)
    # lowercase everything
    all_questions_low = [[j.lower() for j in i] for i in all_questions_init]
    # get rid of oov
    all_questions = [
        q for q in all_questions_low
        if q[0] == ":" or (q[0] in words2ids and q[1] in words2ids
                           and q[2] in words2ids and q[3] in words2ids)
    ]
    results = []
    group = []
    print('group_name', '1nn%', '10nn%')
    # answer questions, combining them in groups
    for line in all_questions:
        if line[0] == ':':
            if group:  # if group is not empty, evaluate and print results
                results[-1].extend(
                    answer_questions_in_group(group, embeddings, words2ids,
                                              top_k))
                print(results[-1][0], '%.2f' % results[-1][1],
                      '%.2f' % results[-1][2])
                group = []
            group_name = line[1]
            results.append([group_name])
        else:
            group.append(line)
    # handle last group's results
    results[-1].extend(
        answer_questions_in_group(group, embeddings, words2ids, top_k))
    print(results[-1][0], '%.2f' % results[-1][1], '%.2f' % results[-1][2])
    # print overall results
    n_syntactic = sum(1 for r in results if r[0].startswith('gram'))
    summarize_analogies_results(results, n_syntactic)
def create_averaged_submission(weighted_submissions):
    names, probs, weights = [], [], []

    for file_name, weight in weighted_submissions.items():
        file_path = join(SUBMISSIONS_DIR, file_name)
        lines = read_lines(file_path)[1:]

        weights.append(weight)

        single_probs = []
        names = []

        for line in lines:
            split = line.rstrip().split(',')
            names.append(split[0])
            single_probs.append(np.array([float(x) for x in split[1:]]))

        probs.append(np.array(single_probs))

    probs = np.array(probs)
    weights = np.array(weights)

    averaged = probs * weights[:, np.newaxis, np.newaxis]
    averaged = np.sum(averaged, axis=0) / np.sum(weights)

    submissions_file = join(
        SUBMISSIONS_DIR,
        'averaged_all.csv'
    )
    create_submission_file(names, averaged, submissions_file)
Example #28
def ranking_precision_in_pruney(pruney, problem_dir, problem_source):
    precision = 0.0
    for name in pruney:
        problem_file = os.path.join(problem_dir, name)
        problem_lines = read_lines(problem_file)
        if problem_source == "Vampire":
            selected_names = [line.split(",")[0].replace("tff(", "")
                              for line in problem_lines
                              if "tff" in line and "axiom" in line]
        if problem_source == "E":
            selected_names = [line.split(", ")[0].replace(
                "fof(", "") for line in problem_lines
                if "fof" in line and "file" in line and "axiom" in line]
        if problem_source == "Q_selection":
            selected_names = [line.split(",")[0].replace(
                "fof(", "") for line in problem_lines
                if "fof" in line and "axiom" in line]
            print(selected_names)
        proofs = pruney[name]
        temp = [len(proof)
                for proof in proofs if proof.issubset(set(selected_names))]
        if temp:
            precision += (max(temp) + 1) / (len(selected_names) + 1)
    precision = precision / len(pruney)
    return precision
def part_two():
    precedences = {
        Token.PLUS: 1,
        Token.ASTERISK: 0,
        Token.LPAREN: 2,
    }

    return sum(run(program, precedences) for program in read_lines(day=18))
Example #30
def ranking_precision(problem_dir, output_dir, ATP, problem_source):
    filenames = os.listdir(output_dir)
    precision = 0.0
    counter = 0
    for name in filenames:

        output_file = os.path.join(output_dir, name)
        lines = read_lines(output_file)
        if ATP == "E" and "# Proof found!" in lines and \
                "# SZS status Theorem" in lines:
            counter += 1
            useful_names = extract_useful_premises_from_E(lines)

            problem_file = os.path.join(problem_dir, name)
            problem_lines = read_lines(problem_file)

            if problem_source == "Vampire":
                problem_len = compute_selected_problem_from_Vampire(
                    problem_lines)
            if problem_source == "E":
                problem_len = compute_selected_problem_from_E(problem_lines)
            if problem_source == "Q_selection":
                problem_len = len(problem_lines)

            precision += len(useful_names) / problem_len

        if ATP == "Vampire" and "% Refutation found. Thanks to Tanya!" \
                in lines:
            counter += 1
            useful_names = extract_useful_premises_from_Vampire(lines)

            problem_file = os.path.join(problem_dir, name)
            problem_lines = read_lines(problem_file)

            if problem_source == "Vampire":
                problem_len = compute_selected_problem_from_Vampire(
                    problem_lines)
            if problem_source == "E":
                problem_len = compute_selected_problem_from_E(problem_lines)
            if problem_source == "Q_selection":
                problem_len = len(problem_lines)

            precision += len(useful_names) / problem_len

    precision = precision / counter
    return precision, counter
def load_cache(fn):
    lines = read_lines(fn)

    pairs = [line.strip().split() for line in lines]
    pairs = filter(lambda pair: len(pair) == 2, pairs)
    cache = dict(pairs)

    # Update cache with actual None values instead of the string 'None'
    for k, v in cache.items():
        if v == 'None':
            cache[k] = None
    return cache
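
The comment above implies a cache file of one whitespace-separated key/value pair per line, with the literal string 'None' standing for a missing value. A tiny illustration with a made-up file:

cache = load_cache('cache.txt')  # file name is a placeholder
# For a file containing the two lines "imageA.jpg hash123" and "imageB.jpg None",
# this returns {'imageA.jpg': 'hash123', 'imageB.jpg': None}.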