Example #1
def get_triples(cluster_data, max_triples_per_page):
    all_triples = []
    for c in trange(len(cluster_data)):
        text = cluster_data[c].texts
        t = list(cluster_data[c].label)
        triples = []
        page_done = False
        for i in range(len(t) - 2):
            for j in range(i + 1, len(t) - 1):
                for k in range(j + 1, len(t)):  # k starts after j so each unordered triple is visited once
                    if len(set([t[i], t[j], t[k]])) == 2:
                        if t[i] == t[j]:
                            triples.append(
                                InputExample(texts=[text[i], text[j], text[k]],
                                             label=0))
                        elif t[j] == t[k]:
                            triples.append(
                                InputExample(texts=[text[j], text[k], text[i]],
                                             label=0))
                        else:
                            triples.append(
                                InputExample(texts=[text[i], text[k], text[j]],
                                             label=0))
                        if max_triples_per_page > 0 and len(
                                triples) >= max_triples_per_page:
                            page_done = True
                            break
                if page_done:
                    break
            if page_done:
                break
        all_triples += triples  # accumulate the triples of every cluster page
    return all_triples
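A minimal usage sketch (not part of the original example): the anchor/positive/negative triples built by get_triples can be fed straight into a sentence-transformers TripletLoss. The checkpoint name and batch size are placeholders, and train_cluster_data is assumed to come from Example #2 below.
# Hedged sketch: wiring get_triples() output into triplet training.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer('all-MiniLM-L6-v2')  # placeholder bi-encoder checkpoint
train_triples = get_triples(train_cluster_data, max_triples_per_page=50)
train_dataloader = DataLoader(train_triples, shuffle=True, batch_size=16)
train_loss = losses.TripletLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=100)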
Example #2
def prepare_cluster_data(train_pages_to_cluster, test_pages_to_cluster,
                         val_samples):
    ng_train = fetch_20newsgroups(subset='train',
                                  remove=('headers', 'footers', 'quotes'))
    ng_test = fetch_20newsgroups(subset='test',
                                 remove=('headers', 'footers', 'quotes'))
    print(ng_train.target_names)

    ng_train.keys()
    train_cluster_data = []
    test_cluster_data = []
    for i in range(len(ng_train['filenames']) // train_pages_to_cluster):
        train_cluster_data.append(
            InputExample(
                texts=ng_train['data'][i * train_pages_to_cluster:(i + 1) *
                                       train_pages_to_cluster],
                label=ng_train['target'][i * train_pages_to_cluster:(i + 1) *
                                         train_pages_to_cluster]))
    val_cluster_data = train_cluster_data[-val_samples:]
    train_cluster_data = train_cluster_data[:-val_samples]
    for i in range(len(ng_test['filenames']) // test_pages_to_cluster):
        test_cluster_data.append(
            InputExample(
                texts=ng_test['data'][i * test_pages_to_cluster:(i + 1) *
                                      test_pages_to_cluster],
                label=ng_test['target'][i * test_pages_to_cluster:(i + 1) *
                                        test_pages_to_cluster]))
    print("Train instances: %5d" % len(train_cluster_data))
    print("Val instances: %5d" % len(val_cluster_data))
    print("Test instances: %5d" % len(test_cluster_data))

    return train_cluster_data, val_cluster_data, test_cluster_data
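The two helpers compose naturally; a short sketch (the group sizes and validation count are arbitrary assumptions, and get_triples comes from Example #1 above):
# Hedged sketch: chaining prepare_cluster_data() with get_triples() from Example #1.
train_cluster_data, val_cluster_data, test_cluster_data = prepare_cluster_data(
    train_pages_to_cluster=50, test_pages_to_cluster=50, val_samples=50)
train_triples = get_triples(train_cluster_data, max_triples_per_page=100)
print('Generated %d training triples' % len(train_triples))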
Example #3
    def get_examples(self, filename, max_examples=0):
        """
        filename specifies which data split to use (train.csv, dev.csv, test.csv).
        """
        filepath = os.path.join(self.dataset_folder, filename)
        self.data = preprocess_crr.read_crr_tsv_as_df(filepath)
        self.negative_sampler = negative_sampling.RandomNegativeSampler(
            list(self.data["response"].values), 1)
        examples = []
        for idx, row in enumerate(
                tqdm(self.data.itertuples(index=False), total=len(self.data))):
            context = row[0]
            relevant_response = row[1]
            examples.append(
                InputExample(guid=filename + str(idx) + "_pos",
                             texts=[context, relevant_response],
                             label=1.0))
            ns_candidates, _, _ = self.negative_sampler.sample(
                context, relevant_response)
            for ns in ns_candidates:
                examples.append(
                    InputExample(guid=filename + str(idx) + "_neg",
                                 texts=[context, ns],
                                 label=0.0))
        return examples
Example #4
def get_frac_triples(cluster_data, num_triples_frac):
    frac_triples = []
    for c in trange(len(cluster_data)):
        text = cluster_data[c].texts
        t = list(cluster_data[c].label)
        triples = []
        for i in range(len(t) - 2):
            for j in range(i + 1, len(t) - 1):
                for k in range(j + 1, len(t)):  # k starts after j so each unordered triple is visited once
                    if len(set([t[i], t[j], t[k]])) == 2:
                        if t[i] == t[j]:
                            triples.append(
                                InputExample(texts=[text[i], text[j], text[k]],
                                             label=0))
                        elif t[j] == t[k]:
                            triples.append(
                                InputExample(texts=[text[j], text[k], text[i]],
                                             label=0))
                        else:
                            triples.append(
                                InputExample(texts=[text[i], text[k], text[j]],
                                             label=0))
        frac_triples += random.sample(triples,
                                      len(triples) // num_triples_frac)
    print('No of train triples: %2d' % len(frac_triples))

    return frac_triples
Example #5
def load_pairwise_data(args, split):
    data = pd.read_csv(
        os.path.join(args.data_dir, "pairwise_pos_%s.csv" % split))
    train_samples = []
    for index, row in data.iterrows():
        train_samples.append(
            InputExample(texts=[row['title_1'], row['title_2']], label=1))
        train_samples.append(
            InputExample(texts=[row['title_2'], row['title_1']], label=1))

    return train_samples
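Since load_pairwise_data emits only positive pairs (label 1 in both directions), MultipleNegativesRankingLoss, which draws negatives from the rest of the batch, is a natural fit. A hedged sketch; the data directory and checkpoint name are placeholders, not taken from the source.
from argparse import Namespace
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

args = Namespace(data_dir='data/')               # assumed location of pairwise_pos_train.csv
model = SentenceTransformer('all-MiniLM-L6-v2')  # placeholder checkpoint
train_samples = load_pairwise_data(args, split='train')
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=100)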
Example #6
    def setUp(self):
        sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
        if not os.path.exists(sts_dataset_path):
            util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                          sts_dataset_path)

        nli_dataset_path = 'datasets/AllNLI.tsv.gz'
        if not os.path.exists(nli_dataset_path):
            util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz',
                          nli_dataset_path)

        #Read NLI
        label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
        self.nli_train_samples = []
        max_train_samples = 10000
        with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn,
                                    delimiter='\t',
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                if row['split'] == 'train':
                    label_id = label2int[row['label']]
                    self.nli_train_samples.append(
                        InputExample(
                            texts=[row['sentence1'], row['sentence2']],
                            label=label_id))
                    if len(self.nli_train_samples) >= max_train_samples:
                        break

        #Read STSB
        self.stsb_train_samples = []
        self.dev_samples = []
        self.test_samples = []
        with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn,
                                    delimiter='\t',
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                score = float(
                    row['score']) / 5.0  # Normalize score to range 0 ... 1
                inp_example = InputExample(
                    texts=[row['sentence1'], row['sentence2']], label=score)

                if row['split'] == 'dev':
                    self.dev_samples.append(inp_example)
                elif row['split'] == 'test':
                    self.test_samples.append(inp_example)
                else:
                    self.stsb_train_samples.append(inp_example)
Example #7
def examples_for_q_answers(q_text: str,
                           a_texts: List[str],
                           a_dists: List[float] = None):
    if a_dists is None:
        a_dists = [0] * len(a_texts)
    for a_text, a_dist in zip(a_texts, a_dists):
        yield InputExample("Infer_example", [q_text, a_text], a_dist)
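A short sketch of consuming the generator for one question; the question and answer texts below are invented placeholders.
q = "How do I flatten a list of lists in Python?"
answers = ["Use itertools.chain.from_iterable.", "Use a nested list comprehension."]
infer_examples = list(examples_for_q_answers(q, answers))
print(len(infer_examples), infer_examples[0].texts)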
Example #8
def main():
    parser = argparse.ArgumentParser(description='Evaluate saved models')
    parser.add_argument('-dt', '--data', default='trec')
    parser.add_argument('-in', '--input_dir', default='~/trec_dataset')
    parser.add_argument('-mp', '--model_path')
    parser.add_argument('-lv', '--level', default='t')

    args = parser.parse_args()
    dataset = args.data
    input_dir = args.input_dir
    model_path = args.model_path
    level = args.level
    if dataset == 'trec':
        test_art_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-article.qrels'
        test_top_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-toplevel.qrels'
        test_hier_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-hierarchical.qrels'
        test_paratext = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/by1test_paratext/by1test_paratext.tsv'

        evaluate_treccar(model_path, test_art_qrels, test_top_qrels,
                         test_hier_qrels, test_paratext, level)
    elif dataset == '20ng':
        pages_to_cluster = 50
        ng_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'))
        test_cluster_data = []
        for i in range(len(ng_test['filenames']) // pages_to_cluster):
            test_cluster_data.append(
                InputExample(
                    texts=ng_test['data'][i * pages_to_cluster:(i + 1) *
                                          pages_to_cluster],
                    label=ng_test['target'][i * pages_to_cluster:(i + 1) *
                                            pages_to_cluster]))
        print("Test instances: %5d" % len(test_cluster_data))
        evaluate_ng20(model_path, test_cluster_data)
Example #9
    def setUp(self):
        sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
        if not os.path.exists(sts_dataset_path):
            util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                          sts_dataset_path)

        #Read STSB
        self.stsb_train_samples = []
        self.dev_samples = []
        self.test_samples = []
        with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn,
                                    delimiter='\t',
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                score = float(
                    row['score']) / 5.0  # Normalize score to range 0 ... 1
                inp_example = InputExample(
                    texts=[row['sentence1'], row['sentence2']], label=score)

                if row['split'] == 'dev':
                    self.dev_samples.append(inp_example)
                elif row['split'] == 'test':
                    self.test_samples.append(inp_example)
                else:
                    self.stsb_train_samples.append(inp_example)
Example #10
    def get_examples(self, language, split='train'):
        pairs = self._load_data()
        if language == 'hi':
            pairs = pairs['hindi_headlines']
        elif language == 'pt':
            pairs = pairs['ciper']
        else:
            pairs = pairs['fact_pairs']
        split_point = int(round(len(pairs) * 0.8))
        if split == 'train':
            pairs = pairs[:split_point]
        else:
            pairs = pairs[split_point:]
        examples = []
        for i, item in enumerate(pairs):
            guid = i
            sentence1 = item['lookup_text']
            sentence2 = item['database_text']
            label = item['label']
            examples.append(
                InputExample(guid=guid,
                             texts=[sentence1, sentence2],
                             label=label))

        return examples
Example #11
    def get_examples(self, language):
        with open('../data/xnli/xnli.dev.jsonl', 'r') as json_file:
            json_list = list(json_file)
            xnli_data = [json.loads(line) for line in json_list]

        with open('../data/xnli/xnli.test.jsonl', 'r') as json_file:
            json_list = list(json_file)
            xnli_data += [json.loads(line) for line in json_list]

        xnli_data = [
            item for item in xnli_data if item['language'] == language
        ]

        examples = []
        for item in xnli_data:
            guid = item['pairID']
            sentence1 = item['sentence1']
            sentence2 = item['sentence2']
            label = item['gold_label']
            examples.append(
                InputExample(guid=guid,
                             texts=[sentence1, sentence2],
                             label=self.map_label(label)))

        return examples
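Three-way labelled NLI pairs like the ones returned here are commonly trained with SoftmaxLoss. A hedged sketch; the reader instance, language code, and checkpoint are assumptions.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # placeholder checkpoint
train_examples = reader.get_examples(language='de')  # assumes an instance of the reader class above
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=3)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=100)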
Example #12
    def get_examples(self, filename, max_examples=0):
        """
        filename specifies which data split to use (train.csv, dev.csv, test.csv).
        """
        filepath = os.path.join(self.dataset_folder, filename)
        with gzip.open(filepath, 'rt',
                       encoding='utf8') if filename.endswith('.gz') else open(
                           filepath, encoding="utf-8") as fIn:
            data = csv.reader(fIn,
                              delimiter=self.delimiter,
                              quoting=self.quoting)
            examples = []
            for id, row in enumerate(data):
                score = float(row[self.score_col_idx])
                if self.normalize_scores:  # Normalize to a 0...1 value
                    score = (score - self.min_score) / (self.max_score -
                                                        self.min_score)

                s1 = row[self.s1_col_idx]
                s2 = row[self.s2_col_idx]
                examples.append(
                    InputExample(guid=filename + str(id),
                                 texts=[s1, s2],
                                 label=score))

                if max_examples > 0 and len(examples) >= max_examples:
                    break

        return examples
Example #13
    def get_examples(self):
        bhaav = pd.read_csv(
            "../data/recasted-hindi-nli-data/bhaav/bhaav_recasted.tsv",
            sep="\t")
        bhaav = bhaav.dropna(subset=['entailment'])
        mr = pd.read_csv(
            "../data/recasted-hindi-nli-data/MR/recasted_movie_review_data.tsv",
            sep="\t")
        pr = pd.read_csv(
            "../data/recasted-hindi-nli-data/PR/recasted_product_review_data.tsv",
            sep="\t")

        examples = []
        idx = 0
        for _, item in pd.concat([bhaav, mr, pr]).iterrows():
            guid = idx
            idx += 1
            sentence1 = item['context']
            sentence2 = item['hypothesis']
            label = item['entailment']
            examples.append(
                InputExample(guid=guid,
                             texts=[sentence1, sentence2],
                             label=self.map_label(label)))

        return examples
Example #14
def triplets_from_labeled_dataset(input_examples):
    # Create triplets for a [(label, sentence), (label, sentence)...] dataset
    # by using each example as an anchor and selecting randomly a
    # positive instance with the same label and a negative instance with a different label
    triplets = []
    label2sentence = defaultdict(list)
    for inp_example in input_examples:
        label2sentence[inp_example.label].append(inp_example)

    for inp_example in input_examples:
        anchor = inp_example

        if len(
                label2sentence[inp_example.label]
        ) < 2:  #We need at least 2 examples per label to create a triplet
            continue

        positive = None
        while positive is None or positive.guid == anchor.guid:
            positive = random.choice(label2sentence[inp_example.label])

        negative = None
        while negative is None or negative.label == anchor.label:
            negative = random.choice(input_examples)

        triplets.append(
            InputExample(
                texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))

    return triplets
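The generated triplets can be used either for TripletLoss training or, as sketched here, for evaluation with TripletEvaluator. The dev_set and checkpoint name are placeholder assumptions.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import TripletEvaluator

model = SentenceTransformer('all-MiniLM-L6-v2')        # placeholder checkpoint
dev_triplets = triplets_from_labeled_dataset(dev_set)  # dev_set: list of labelled InputExample
dev_evaluator = TripletEvaluator.from_input_examples(dev_triplets, name='dev')
print(dev_evaluator(model))  # fraction of triplets where the anchor is closer to the positive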
Example #15
    def get_examples(self, filename, max_examples=0):
        """
        filename specifies which data split to use (train.csv, dev.csv, test.csv).
        """
        data = csv.reader(open(os.path.join(self.dataset_folder, filename),
                               encoding="utf-8"),
                          delimiter=self.delimiter,
                          quoting=self.quoting)
        examples = []
        for id, row in enumerate(data):

            try:
                score = float(row[self.score_col_idx])
            except:
                print(row[self.score_col_idx])
                continue

            if self.normalize_scores:  # Normalize to a 0...1 value
                score = (score - self.min_score) / (self.max_score -
                                                    self.min_score)

            s1 = row[self.s1_col_idx]
            s2 = row[self.s2_col_idx]
            examples.append(
                InputExample(guid=filename + str(id),
                             texts=[s1, s2],
                             label=score))

            if max_examples > 0 and len(examples) >= max_examples:
                break

        return examples
Example #16
    def get_examples(self, fn):
        examples = []
        for line in open(fn):
            sent1, sent2, label = line.strip().split('\t')
            examples.append(InputExample(guid=self.guid,
                texts=[sent1, sent2],
                label=int(label)))
            self.guid += 1
        return examples
Example #17
def get_data(data_file):
    train_samples = []
    with open(data_file, "r", encoding="utf-8") as f:
        dataset = json.loads(f.read())
        for cur_dialg in dataset:
            train_samples.append(
                InputExample(texts=[
                    cur_dialg[0].strip().replace(" ", ""),
                    cur_dialg[1].strip().replace(" ", "")
                ],
                             label=1))  # single- or multi-turn, only the two most relevant (first two) utterances are kept
    return train_samples
Example #18
def trec_dataset(
    directory="datasets/trec/",
    train_filename="train_5500.label",
    test_filename="TREC_10.label",
    validation_dataset_nb=500,
    urls=[
        "http://cogcomp.org/Data/QA/QC/train_5500.label",
        "http://cogcomp.org/Data/QA/QC/TREC_10.label",
    ],
):

    os.makedirs(directory, exist_ok=True)

    ret = []
    for url, filename in zip(urls, [train_filename, test_filename]):
        full_path = os.path.join(directory, filename)
        urllib.request.urlretrieve(url, filename=full_path)

        examples = []
        label_map = {}
        guid = 1
        for line in open(full_path, "rb"):
            # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
            label, _, text = line.replace(b"\xf0",
                                          b" ").strip().decode().partition(" ")

            # We extract the upper category (e.g. DESC from DESC:def)
            label, _, _ = label.partition(":")

            if label not in label_map:
                label_map[label] = len(label_map)

            label_id = label_map[label]
            guid += 1
            examples.append(
                InputExample(guid=guid, texts=[text], label=label_id))
        ret.append(examples)

    train_set, test_set = ret
    dev_set = None

    # Create a dev set from train set
    if validation_dataset_nb > 0:
        dev_set = train_set[-validation_dataset_nb:]
        train_set = train_set[:-validation_dataset_nb]

    # For dev & test set, we return triplets (anchor, positive, negative)
    random.seed(42)  #Fix seed, so that we always get the same triplets
    dev_triplets = triplets_from_labeled_dataset(dev_set)
    test_triplets = triplets_from_labeled_dataset(test_set)

    return train_set, dev_triplets, test_triplets
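One way to use the returned splits is batch-hard triplet training on the labelled train_set, with dev_triplets/test_triplets feeding a TripletEvaluator. A hedged sketch: the SentenceLabelDataset constructor has varied across sentence-transformers versions, so treat this as an outline rather than exact API usage; the checkpoint is a placeholder.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import SentenceLabelDataset

model = SentenceTransformer('all-MiniLM-L6-v2')  # placeholder checkpoint
train_set, dev_triplets, test_triplets = trec_dataset()
train_sampler = SentenceLabelDataset(train_set)  # groups same-label examples into each batch
train_dataloader = DataLoader(train_sampler, batch_size=32, drop_last=True)
train_loss = losses.BatchHardTripletLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=100)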
Example #19
def examples_from_questions_tup(questions: Iterable[Tuple[int, Question]]):
    for q_i, q in questions:
        if q_i % 10000 == 0:
            print("Loading %s" % q_i)
        if q.answers is None:
            continue
        all_q_upvotes = [a.score for a in q.answers]
        all_q_dists = upvotes_to_distance(all_q_upvotes)
        if np.isnan(all_q_dists).any():
            # skip questions whose answers are all equally rated (mostly 0-vote answers): we do not know anything about their relative quality
            continue
        for a_i, a in enumerate(q.answers):
            yield InputExample("%s_%s" % (q_i, a_i), [q.body, a.body], all_q_dists[a_i])
Example #20
def get_data(data_file):
    train_samples = []
    discard_num = 0
    with open(data_file, "r", encoding="utf-8") as f:
        dataset = json.loads(f.read())
        for cur_dialg in dataset:
            query_sent = data_clean(cur_dialg[0])
            content_sent = data_clean(cur_dialg[1])
            if len(query_sent) == 0 or len(content_sent) == 0:
                discard_num += 1
                continue
            else:
                train_samples.append(InputExample(texts=[query_sent, content_sent], label=1))   # single- or multi-turn, only the two most relevant (first two) utterances are kept
    return train_samples
Example #21
def trec_dataset(
    directory="datasets/trec/",
    train_filename="train_5500.label",
    test_filename="TREC_10.label",
    validation_dataset_nb=500,
    urls=[
        "http://cogcomp.org/Data/QA/QC/train_5500.label",
        "http://cogcomp.org/Data/QA/QC/TREC_10.label",
    ],
):

    os.makedirs(directory, exist_ok=True)

    ret = []
    for url, filename in zip(urls, [train_filename, test_filename]):
        full_path = os.path.join(directory, filename)
        urllib.request.urlretrieve(url, filename=full_path)

        examples = []
        label_map = {}
        guid = 0
        for line in open(full_path, "rb"):
            # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
            label, _, text = line.replace(b"\xf0",
                                          b" ").strip().decode().partition(" ")

            # We extract the upper category (e.g. DESC from DESC:def)
            label, _, _ = label.partition(":")

            if label not in label_map:
                label_map[label] = len(label_map)

            guid += 1
            label_id = label_map[label]
            examples.append(
                InputExample(guid=guid, texts=[text], label=label_id))
        ret.append(examples)

    # Validation dataset:
    # It doesn't exist in the original dataset,
    # so we create one by splitting the train data
    # Ret[0] is train
    # Ret[1] is test
    # Ret[2] is val
    if validation_dataset_nb > 0:
        ret.append(ret[0][-validation_dataset_nb:])
        ret[0] = ret[0][:-validation_dataset_nb]

    return ret
Example #22
def get_pairs(cluster_data, balanced):
    pairs = []
    if balanced:
        print('Going to balance the datasets')
    for c in trange(len(cluster_data)):
        text = cluster_data[c].texts
        t = list(cluster_data[c].label)
        pos_pairs, neg_pairs = [], []
        for i in range(len(t) - 1):
            for j in range(i + 1, len(t)):
                if t[i] == t[j]:
                    pos_pairs.append(
                        InputExample(texts=[text[i], text[j]], label=1))
                else:
                    neg_pairs.append(
                        InputExample(texts=[text[i], text[j]], label=0))
        if balanced:
            neg_pairs = random.sample(neg_pairs, len(pos_pairs))
        pairs_loc = pos_pairs + neg_pairs
        random.shuffle(pairs_loc)
        pairs += pairs_loc
    print('No of train pairs: %2d' % len(pairs))

    return pairs
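The binary same-cluster / different-cluster pairs suit a contrastive objective. A hedged sketch; the cluster data is assumed to come from Example #2, and the checkpoint and batch size are placeholders.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer('all-MiniLM-L6-v2')  # placeholder checkpoint
train_pairs = get_pairs(train_cluster_data, balanced=True)
train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=32)
train_loss = losses.OnlineContrastiveLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=100)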
Example #23
def trec_dataset(
    directory="datasets/trec/",
    train_filename="train_5500.label",
    test_filename="TREC_10.label",
    validation_dataset_nb=500,
    urls=[
        "https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label",
        "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label",
    ],
):
    os.makedirs(directory, exist_ok=True)

    ret = []
    for url, filename in zip(urls, [train_filename, test_filename]):
        full_path = os.path.join(directory, filename)
        if not os.path.exists(full_path):
            util.http_get(url, full_path)

        examples = []
        label_map = {}
        for guid, line in enumerate(open(full_path, "rb"), start=2):
            # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
            label, _, text = line.replace(b"\xf0", b" ").strip().decode().partition(" ")

            if label not in label_map:
                label_map[label] = len(label_map)

            label_id = label_map[label]
            examples.append(InputExample(guid=guid, texts=[text], label=label_id))
        ret.append(examples)

    train_set, test_set = ret
    dev_set = None

    # Create a dev set from train set
    if validation_dataset_nb > 0:
        dev_set = train_set[-validation_dataset_nb:]
        train_set = train_set[:-validation_dataset_nb]

    # For dev & test set, we return triplets (anchor, positive, negative)
    random.seed(42) #Fix seed, so that we always get the same triplets
    dev_triplets = triplets_from_labeled_dataset(dev_set)
    test_triplets = triplets_from_labeled_dataset(test_set)

    return train_set, dev_triplets, test_triplets
Example #24
def read_dataset(train_data_path):

    data = csv.reader(open(os.path.join(train_data_path), encoding="utf-8"),
                      delimiter="\t",
                      quoting=csv.QUOTE_NONE)

    label_map = {}
    train_set = []
    guid = 0
    for line in data:
        # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
        text, label = line
        if label not in label_map:
            label_map[label] = len(label_map)

        label_id = label_map[label]
        guid += 1
        train_set.append(InputExample(guid=guid, texts=[text], label=label_id))

    return train_set
Example #25
    def get_examples(self, max_examples: int = None):
        """Get a set of examples as required by SentencesDataset.

        :param max_examples: number of samples to return, defaults to None
        :return: list of InputExample objects
        """
        if max_examples is None:
            max_examples = self.df.shape[0]
        s1 = self.df["sentence1"].iloc[:max_examples].values
        s2 = self.df["sentence2"].iloc[:max_examples].values
        labels = self.df["label"].astype(int).iloc[:max_examples].values
        examples = []
        for guid_id, (sentence_a, sentence_b, label) in enumerate(
                zip(s1, s2, labels)):
            examples.append(
                InputExample(
                    guid=guid_id,
                    texts=[
                        sentence_a,
                        sentence_b],
                    label=label))
        return examples
Example #26
def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout
    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read custom train dataset")

    train_samples = []
    val_samples = []
    inp_list = []
    dataset_path = args.data_path
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(
                row['score']) / 10  # Normalize score to range 0 ... 1
            inp_list.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))

    from sklearn.model_selection import train_test_split
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)
    # import ipdb; ipdb.set_trace()

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    # evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev')
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training. We skip evaluation in this example
    warmup_steps = math.ceil(
        len(train_dataset) * num_epochs / train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # import ipdb; ipdb.set_trace()
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
Example #27
logging.info(
    "Step 1: Train cross-encoder: ({}) with STSbenchmark".format(model_name))

gold_samples = []
dev_samples = []
test_samples = []

with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1

        if row['split'] == 'dev':
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
        elif row['split'] == 'test':
            test_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
        else:
            #As we want to get symmetric scores, i.e. CrossEncoder(A,B) = CrossEncoder(B,A), we pass both combinations to the train set
            gold_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
            gold_samples.append(
                InputExample(texts=[row['sentence2'], row['sentence1']],
                             label=score))

# We wrap gold_samples (which is a List[InputExample]) into a pytorch DataLoader
train_dataloader = DataLoader(gold_samples,
Example #28
        url=
        'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip',
        path=zip_save_path)
    with ZipFile(zip_save_path, 'r') as zip:
        zip.extractall(dataset_path)

######### Read train data  ##########
train_samples_MultipleNegativesRankingLoss = []
train_samples_ConstrativeLoss = []

with open(os.path.join(dataset_path, "classification/train_pairs.tsv"),
          encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        train_samples_ConstrativeLoss.append(
            InputExample(texts=[row['question1'], row['question2']],
                         label=int(row['is_duplicate'])))
        if row['is_duplicate'] == '1':
            train_samples_MultipleNegativesRankingLoss.append(
                InputExample(texts=[row['question1'], row['question2']],
                             label=1))
            train_samples_MultipleNegativesRankingLoss.append(
                InputExample(texts=[row['question2'], row['question1']],
                             label=1)
            )  # if A is a duplicate of B, then B is a duplicate of A

# Create data loader and loss for MultipleNegativesRankingLoss
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(
    train_samples_MultipleNegativesRankingLoss, model=model)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(
    train_dataset_MultipleNegativesRankingLoss,
    shuffle=True,
Example #29
ranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Base loss
from sentence_transformers import SentencesDataset, losses
from sentence_transformers.readers import InputExample

examples = []

for topic in topics:
    gold = qrel[topic["number"]].items()
    query = topic["title"].strip()

    for item in gold:
        try:
            doc = db.lookup_docno(item[0])
            examples.append(InputExample(texts=[query, doc], label=item[1]))
        except:
            continue
print("finished", len(examples))

#%%
from torch.utils.data import DataLoader
train_dataset = SentencesDataset(examples, ranker)
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=16)
train_loss = losses.OnlineContrastiveLoss(model=ranker)

ranker.fit(train_dataloader=train_dl,
           epochs=20,
           output_path="ranker/constrastive_loss/",
           save_best_model=True)
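After fitting, the cross-encoder can score query/document pairs directly; a short sketch with invented placeholder texts.
query = "effects of caffeine on sleep"
docs = ["Caffeine delays sleep onset in most adults.",
        "The 2008 financial crisis explained."]
scores = ranker.predict([[query, d] for d in docs])  # one relevance score per pair
print(scores)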
Example #30
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read STSbenchmark train dataset")

    train_samples = []
    dev_samples = []
    test_samples = []
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(
                row['score']) / 5.0  # Normalize score to range 0 ... 1
            inp_example = InputExample(
                texts=[row['sentence1'], row['sentence2']], label=score)

            if row['split'] == 'dev':
                dev_samples.append(inp_example)
            elif row['split'] == 'test':
                test_samples.append(inp_example)
            else:
                train_samples.append(inp_example)

    train_dataloader = DataLoader(train_samples,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read STSbenchmark dev dataset")
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(