Exemple #1
0
def get_other_results(queries, qml_rankings, num_ranks=None):
    """Produce BM25, GloVe and RM3 rankings for every query.

    Documents are loaded via ``read_cache``, ordered by integer document id,
    tokenized and indexed with ``BM25``.  Each tokenized query is paired with
    the entry at the same position in ``qml_rankings`` and handed to the three
    ranking helpers.

    Args:
        queries: query texts; tokenized here with the same rule set as the
            documents.
        qml_rankings: one ranking per query, forwarded unchanged to each
            ranking helper.
        num_ranks: accepted but not used by this function.

    Returns:
        Tuple ``(bm25_rankings, glove_rankings, rm3_rankings)``; each list has
        one entry per (query, ranking) pair.
    """
    lookup = read_cache('./doc_lookup.json', get_robust_documents)
    # The fallback for the title->id cache only prints 'failed'; the cache
    # file is presumably expected to exist already -- TODO confirm.
    title_to_id = read_cache('./document_title_to_id.json',
                             lambda: print('failed'))
    id_to_title = _.invert(title_to_id)
    # Position i of `ordered_docs` holds the document whose id is i.
    ordered_docs = [
        lookup[id_to_title[idx]] for idx in range(len(id_to_title))
    ]
    tok = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tok_docs = read_cache('tok_docs.json',
                          lambda: tok.process_all(ordered_docs))
    tok_queries = tok.process_all(queries)
    bm25 = BM25(tok_docs)
    idf_total = sum(float(weight) for weight in bm25.idf.values())
    average_idf = idf_total / len(bm25.idf)
    bm25_out, glove_out, rm3_out = [], [], []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    docs_lms = _calc_docs_lms(bm25.df, bm25.f)
    for query, ranking in progressbar(zip(tok_queries, qml_rankings)):
        bm25_out.append(
            _get_bm25_ranking(bm25, ranking, query, average_idf=average_idf))
        glove_out.append(_get_glove_ranking(glove, tok_docs, ranking, query))
        rm3_out.append(_get_rm3_ranking(docs_lms, bm25.f, ranking, query))
    return bm25_out, glove_out, rm3_out
Exemple #2
0
def main():
  """Index the document collection with BM25 and dump its idf table.

  Documents are ordered by integer id, tokenized (through the
  'tok_docs.json' cache) and indexed; ``bm25.idf`` is then written as JSON
  to './doc_word_idf.json'.
  """
  lookup = read_cache('./doc_lookup.json', get_robust_documents)
  title_to_id = create_id_lookup(lookup.keys())
  id_to_title = _.invert(title_to_id)
  # Position i of `ordered_docs` holds the document whose id is i.
  ordered_docs = [lookup[id_to_title[idx]] for idx in range(len(id_to_title))]
  tok = Tokenizer()
  tok_docs = read_cache('tok_docs.json', lambda: tok.process_all(ordered_docs))
  index = BM25(tok_docs)
  with open('./doc_word_idf.json', 'w+') as out_file:
    json.dump(index.idf, out_file)
Exemple #3
0
def get_texts(texts, with_preprocess=False, pre_rules=None):
    r"""
    Tokenize `texts` with fastai's Tokenizer, which applies a series of
    convenient rules during tokenization.
    See here: https://docs.fast.ai/text.transform.html#Tokenizer

    If `with_preprocess` is true, each text is first passed through
    `simple_preprocess` and re-joined with single spaces. If `pre_rules` is
    non-empty, those rules are placed in front of the tokenizer's own
    pre-rules. Returns whatever `Tokenizer.process_all` returns.
    """
    tokenizer = Tokenizer()
    docs = texts
    if with_preprocess:
        docs = [" ".join(simple_preprocess(doc)) for doc in docs]
    if pre_rules:
        # Caller-supplied rules go first, ahead of the defaults.
        tokenizer.pre_rules = pre_rules + tokenizer.pre_rules
    return tokenizer.process_all(docs)
Exemple #4
0
def baselines_eval():
    """Print metrics@k for the stored test rankings and a GloVe baseline.

    ``k`` is 10 when the script gets no command-line arguments, otherwise
    ``int(sys.argv[1])``.  When '--rerank' appears in ``sys.argv`` the GloVe
    ranker is restricted to the first ``k`` documents of each stored ranking;
    otherwise it is called with ``doc_ids=None``.
    """
    rankings_to_eval = read_query_test_rankings()
    qrels = parse_qrels()
    query_ids = list(qrels.keys())
    query_lookup = get_robust_eval_queries()
    queries = [query_lookup[qid] for qid in query_ids]
    k = int(sys.argv[1]) if len(sys.argv) != 1 else 10
    document_lookup = read_cache(name('./doc_lookup.json', ['with_titles']),
                                 get_robust_documents_with_titles)
    # The fallback for the title->id cache only prints 'failed'; the cache
    # file is presumably expected to exist already -- TODO confirm.
    title_to_id = read_cache('./document_title_to_id.json',
                             lambda: print('failed'))
    # Translate titles to integer ids, keeping the order of `query_ids`.
    ordered_rankings_to_eval = [
        [title_to_id[title] for title in rankings_to_eval[qid]]
        for qid in query_ids
    ]
    ordered_qrels = [
        [title_to_id[title] for title in qrels[qid]] for qid in query_ids
    ]
    id_to_title = _.invert(title_to_id)
    # Position i of `documents` holds the document whose id is i.
    documents = [
        document_lookup[id_to_title[idx]] for idx in range(len(id_to_title))
    ]
    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache(
        'tok_docs.json',
        lambda: tokenizer.process_all(clean_documents(documents)))
    tokenized_queries = tokenizer.process_all(clean_documents(queries))
    bm25 = gensim_bm25.BM25(tokenized_documents)
    # Disabled LSI baseline, kept for reference:
    # with open('./caches/106756_most_common_doc.json', 'r') as fh:
    #   doc_token_set = set(json.load(fh))
    # corpus, token_lookup = tokens_to_indexes(tokenized_documents,
    #                                          None,
    #                                          token_set=doc_token_set)
    # corpus = [[[token_lookup[term], f] for term, f in doc_fs.items()] for doc_fs in bm25.f]
    # tfidf = TfidfModel(corpus)
    # lsi = LsiModel(tfidf, id2word=_.invert(token_lookup), num_topics=300)
    glove_rankings = []
    # lsi_rankings = []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    doc_vectors = torch.stack(
        [encode_glove_fs(glove, bm25.idf, doc_fs) for doc_fs in bm25.f])
    # Divide every row by its own norm.
    row_norms = torch.norm(doc_vectors, dim=1).unsqueeze(1)
    doc_vectors = doc_vectors / row_norms
    rerank = '--rerank' in sys.argv
    query_ranking_pairs = zip(tokenized_queries, ordered_rankings_to_eval)
    for query, stored_ranking in progressbar(
            query_ranking_pairs, max_value=len(tokenized_queries)):
        candidate_ids = stored_ranking[:k] if rerank else None
        glove_rankings.append(
            rank_glove(glove, bm25.idf, doc_vectors, query,
                       doc_ids=candidate_ids))
        # lsi_rankings.append(rank_lsi(lsi, tfidf, [token_lookup[term] if term in token_lookup else 0 for term in q], doc_ids=doc_ids))
    print('indri:', metrics_at_k(ordered_rankings_to_eval, ordered_qrels, k))
    print('glove:', metrics_at_k(glove_rankings, ordered_qrels, k))
def get_preds(user_input, vectorizer, m):
    """Tokenize `user_input`, predict per token and return an HTML snippet.

    The tokens are vectorized with `vectorizer.transform`, scored with
    `m.predict`, collected in a two-column DataFrame ('toks', 'preds') and
    rendered by `html_format`.  The rendered pieces are joined with spaces
    and wrapped in a ``<div>``.
    """
    spacy_tok = SpacyTokenizer('en')
    tokens = Tokenizer().process_text(user_input, spacy_tok)
    predictions = m.predict(vectorizer.transform(tokens))

    frame = pd.DataFrame()
    frame['toks'] = tokens
    frame['preds'] = predictions

    joined = ' '.join(html_format(frame))
    # NOTE(review): the closing literal contains a "</span>" with no opening
    # tag in this function -- presumably html_format leaves one open; confirm.
    return "<div>" + joined + "</span></div>"
Exemple #6
0
def preprocess_texts(texts,
                     token_lookup=None,
                     num_tokens=None,
                     token_set=None,
                     drop_if_any_unk=False):
    """Tokenize `texts` and convert the tokens to indexes.

    The texts are tokenized with a default `Tokenizer`; the result and all
    remaining arguments are forwarded unchanged to `tokens_to_indexes`.

    Returns:
        The `(idx_texts, token_lookup)` pair produced by `tokens_to_indexes`.
    """
    token_lists = Tokenizer().process_all(texts)
    indexing_options = dict(num_tokens=num_tokens,
                            token_set=token_set,
                            drop_if_any_unk=drop_if_any_unk)
    idx_texts, token_lookup = tokens_to_indexes(token_lists, token_lookup,
                                                **indexing_options)
    return idx_texts, token_lookup
Exemple #7
0
def get_texts(df):
    """Return `(tokens, labels)` for a frame with labels in column 0 and text in column 1.

    Labels are cast to int64.  Every text is prefixed with the BOS/FLD
    markers, passed through `fixup`, and tokenized in parallel with the
    'en_core_web_sm' tokenizer.
    """
    label_values = df[0].values.astype(np.int64)
    prefixed = f'\n{BOS} {FLD} 1 ' + df[1].astype(str)
    cleaned = list(prefixed.apply(fixup).values)

    tokenizer = Tokenizer(lang='en_core_web_sm')
    tokens = tokenizer.proc_all_mp(partition_by_cores(cleaned),
                                   lang='en_core_web_sm')
    return tokens, list(label_values)
Exemple #8
0
def prepare_clas_dataset(input_path,
                         output_dir=None,
                         valid_split=0.2,
                         tokenizer_lang="xx",
                         min_freq=2,
                         seed=42):
    """
    Build and save the fine-tuning and classification datasets from a CSV file.

    The CSV is split into training and validation frames, a language-model
    data bunch is built from the texts, and a classification data bunch is
    built on top of the language-model training vocabulary. Both are saved
    as "data_finetune_lm.pkl" and "data_clas.pkl".

    Attributes:
        input_path (str): Path to CSV file with texts in the first and labels in second column.
        output_dir (str): Folder where to store the processed dataset. Defaults to the
            folder containing `input_path`.
        valid_split (float): A fraction of data used for validation.
        tokenizer_lang (str): Language setting for tokenizer.
        min_freq (int): Minimal number of occurrences of a word to be considered for adding
            to vocabulary.
        seed (int): Random seed that determines the training-validation split.
    """
    input_path = Path(input_path)
    if not output_dir:
        output_dir = input_path.parent
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    train_df, valid_df = csv_to_train_valid_df(input_path, valid_split, seed)

    lm_bunch = TextLMDataBunch.from_df(output_dir,
                                       train_df,
                                       valid_df,
                                       tokenizer=Tokenizer(lang=tokenizer_lang),
                                       text_cols=0,
                                       min_freq=min_freq)
    # The classifier reuses the vocabulary of the language-model training set.
    clas_bunch = TextClasDataBunch.from_df(
        output_dir,
        train_df,
        valid_df,
        tokenizer=Tokenizer(lang=tokenizer_lang),
        text_cols=0,
        label_cols=1,
        vocab=lm_bunch.train_ds.vocab,
        bs=32,
        min_freq=min_freq)

    lm_bunch.save("data_finetune_lm.pkl")
    clas_bunch.save("data_clas.pkl")
def get_german_db(bs=32):
    """Build a data bunch from '/data/10kgerman/train.csv'.

    Texts come from the "text" column and labels from column 'c'; 10% of the
    rows are split off at random.  Tokenization uses a `Tokenizer` with
    lang='de'.  `bs` is assigned to the bunch's `batch_size` after creation.
    """
    processor = [
        TokenizeProcessor(tokenizer=Tokenizer(lang='de')),
        NumericalizeProcessor(),
    ]
    text_list = TextList.from_csv('/data/10kgerman',
                                  'train.csv',
                                  cols=["text"],
                                  processor=processor)
    split_list = text_list.split_by_rand_pct(0.1)
    db = split_list.label_from_df('c').databunch()
    db.batch_size = bs
    return db
Exemple #10
0
def get_texts_and_tokenize(df, n_lbls=1):
    """Return `(texts, tokens, labels)` extracted from `df`.

    The first `n_lbls` columns (by position) are read as labels, but only
    the first of them is parsed, with `ast.literal_eval`.  Texts are taken
    from column `n_lbls`, prefixed with the BOS marker, cleaned with `fixup`
    and tokenized in parallel.
    """
    label_rows = df.iloc[:, range(n_lbls)].values.tolist()
    labels = [ast.literal_eval(row[0]) for row in label_rows]

    prefixed = f'\n{BOS} ' + df[n_lbls].astype(str)
    # perform common fixup to text:
    texts = list(prefixed.apply(fixup).values)

    # tokenize texts
    tokenizer = Tokenizer()
    tok = tokenizer.proc_all_mp(partition_by_cores(texts))
    return texts, tok, labels
Exemple #11
0
    def predict(self, text):
        """Score a single text with `self.model`.

        The text is prefixed with 'xbos xfld 1 ', tokenized, mapped to ids
        through `self.inspire_data_stoi`, and fed to the model as a column
        of token ids.  Returns the first element of the softmax over the
        first row of the model's first output.
        """
        self.model.reset()
        self.model.eval()

        prefixed = 'xbos xfld 1 ' + text
        tokenizer = Tokenizer(lang='en_core_web_sm')
        token_lists = tokenizer.proc_all_mp(partition_by_cores([prefixed]),
                                            lang='en_core_web_sm')
        token_ids = [self.inspire_data_stoi[token] for token in token_lists[0]]
        # Shape the ids as a single column before handing them to the model.
        column = np.array(token_ids).reshape(-1, 1)
        model_input = Variable(torch.from_numpy(column))
        outputs = self.model(model_input)
        scores = outputs[0].data.cpu().numpy()

        return numpy_softmax(scores[0])[0]
Exemple #12
0
 def __init__(self, model_name):
     """Fetch the pretrained pieces for `model_name` and wrap them.

     The tokenizer, config and model are obtained through
     `self.fetch_pretrained`, then wrapped in a `Tokenizer` without pre- or
     post-rules, a `TransformersVocab` and a `CustTransformerModel`.
     """
     super().__init__()
     self.model_class = AutoModelForSequenceClassification
     self.model_name = model_name
     # Fetch in a fixed order: tokenizer, config, model.
     pretrained_classes = (self.tokenizer_class, self.config_class,
                           self.model_class)
     fetched = [self.fetch_pretrained(klass) for klass in pretrained_classes]
     self.pt_tokenizer, self.pt_config, self.pt_model = fetched
     self.base_tokenizer = transformer_tokenizer.TransformersBaseTokenizer(
         self.model_name, self.pt_tokenizer)
     self.tokenizer = Tokenizer(tok_func=self.base_tokenizer,
                                pre_rules=[],
                                post_rules=[])
     self.vocab = transformer_vocab.TransformersVocab(self.pt_tokenizer)
     self.model = CustTransformerModel(self.pt_model)
    def __init__(self, path):
        """Create an AWD_LSTM language-model learner and load tuned weights.

        Texts are read from 'jokes_extended_vk_anekdot_preproc.csv' under
        `path` (rows with missing values are dropped), and the weights and
        vocabulary are loaded from the 'ulmfit' folder under `path`.
        """
        csv_file = path + '/jokes_extended_vk_anekdot_preproc.csv'
        texts = pd.read_csv(csv_file, index_col=0)
        texts.dropna(inplace=True)

        processor = [
            TokenizeProcessor(tokenizer=Tokenizer(lang="xx")),
            NumericalizeProcessor(min_freq=2, max_vocab=60000),
        ]
        text_list = TextList.from_df(texts, processor=processor)
        data = (text_list.split_by_rand_pct(.1)
                .label_for_lm()
                .databunch(bs=64))

        self.learn = language_model_learner(data=data,
                                            arch=AWD_LSTM,
                                            pretrained=None)
        weights_file = path + '/ulmfit/bestmodel_tune.pth'
        itos_file = path + '/ulmfit/bestmodel_tune_itos.pkl'
        self.learn.load_pretrained(weights_file, itos_file)
Exemple #14
0
def evaluate_lm(data_path,
                model_dir,
                tokenizer_lang="xx",
                evaluate_custom_perplexity=False):
    """
    Print loss, perplexity and accuracy of a trained language model on texts from a CSV file.

    The vocabulary is read from "lm_itos.pkl" and the hyper-parameters from
    "model_hparams.json", both inside `model_dir`. The CSV frame is used as
    both the training and the validation frame of the data bunch.

    Attributes:
        data_path (str): Path to CSV file with texts in the first column.
        model_dir (str): Directory with a trained language model.
        tokenizer_lang (str): Language setting for tokenizer.
        evaluate_custom_perplexity (bool): The perplexity is estimated as e^(avg. loss),
            but the average loss changes slightly with batch size. Set this to True to
            also print a perplexity computed in a slower but controlled fashion. The
            discrepancy between the two is empirically approximately 1%.
    """
    model_dir = Path(model_dir)
    # NOTE(review): pickle.load runs arbitrary code from the file; only point
    # `model_dir` at trusted model directories.
    with open(model_dir / "lm_itos.pkl", "rb") as itos_file:
        itos = pickle.load(itos_file)

    frame = pd.read_csv(data_path, header=None)
    data = TextLMDataBunch.from_df(
        "",
        frame,
        frame,
        text_cols=0,
        tokenizer=Tokenizer(lang=tokenizer_lang),
        vocab=Vocab(itos))

    with open(model_dir / "model_hparams.json", "r") as hparams_file:
        hparams = json.load(hparams_file)
    learner = lm_learner(data,
                         AWD_LSTM,
                         model_dir,
                         pretrained=True,
                         config=hparams)

    loss, acc = learner.validate()
    summary = "Loss: {}, Perplexity: {}, Accuracy: {}".format(
        loss, exp(loss), acc)
    print(summary)
    if not evaluate_custom_perplexity:
        return
    custom_stats = evaluate_perplexity(learner, data.valid_ds.x)
    print(
        "Custom perplexity: {}, Fraction OOV: {}, OOV perplexity contribution: {}"
        .format(*custom_stats))
Exemple #15
0
def label_linkable_tokens(article_html, tokenizer=None, label_all=True):
    """Tokenize an HTML article and mark the tokens that belong to link text.

    The text of every ``<a>`` element is tokenized and searched for (with
    ``kmp``) inside the tokenized full text of the article; matched spans
    are set to 1 in the target vector.

    Args:
        article_html: HTML markup of the article.
        tokenizer: object providing ``process_all``.  When ``None`` (the
            default) a fresh ``Tokenizer()`` is created on each call, so no
            instance is built at definition time or shared between calls.
        label_all: if truthy, every occurrence of each link text is
            labelled; otherwise only the first occurrence is.

    Returns:
        ``(tokenised_text, target)`` where ``target`` is a float array of
        zeros and ones with one entry per token of ``tokenised_text``.
    """
    if tokenizer is None:
        tokenizer = Tokenizer()

    parsed_html = BeautifulSoup(article_html, 'html.parser')

    link_text = [link.text for link in parsed_html.find_all('a')]
    tokenised_links = tokenizer.process_all(link_text)

    tokenised_text = tokenizer.process_all([parsed_html.text])[0]

    target = np.zeros(len(tokenised_text))

    for link in tokenised_links:
        start_positions = kmp(tokenised_text, link)
        if not label_all:
            # Keep at most the first match; an empty result stays empty.
            start_positions = start_positions[:1]
        for pos in start_positions:
            target[pos:pos + len(link)] = 1

    return tokenised_text, target
Exemple #16
0
def prepare_lm_dataset(input_path,
                       output_dir=None,
                       valid_split=0.2,
                       tokenizer_lang="xx",
                       min_freq=2,
                       seed=42):
    """
    Build, save and dump a language-model dataset from a CSV file of texts.

    The CSV is split into training and validation frames and turned into a
    data bunch that is saved as "data_lm.pkl". The tokenized training and
    validation items are also written, one per line, to
    "data_lm_tokenized_train.txt" and "data_lm_tokenized_valid.txt" in
    `output_dir`.

    Attributes:
        input_path (str): Path to CSV file where there are texts in the first column.
        output_dir (str): Folder where to store the processed dataset. Defaults to the
            folder containing `input_path`.
        valid_split (float): A fraction of data used for validation.
        tokenizer_lang (str): Language setting for tokenizer.
        min_freq (int): Minimal number of occurrences of a word to be considered for adding
            to vocabulary.
        seed (int): Random seed that determines the training-validation split.
    """
    input_path = Path(input_path)
    if not output_dir:
        output_dir = input_path.parent
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    train_df, valid_df = csv_to_train_valid_df(input_path, valid_split, seed)

    data_lm = TextLMDataBunch.from_df(
        output_dir,
        train_df,
        valid_df,
        text_cols=0,
        tokenizer=Tokenizer(lang=tokenizer_lang),
        min_freq=min_freq)
    data_lm.save("data_lm.pkl")

    train_lines = [str(item) for item in data_lm.train_ds.x]
    with open(output_dir / "data_lm_tokenized_train.txt", "w") as train_file:
        train_file.write("\n".join(train_lines))
    valid_lines = [str(item) for item in data_lm.valid_ds.x]
    with open(output_dir / "data_lm_tokenized_valid.txt", "w") as valid_file:
        valid_file.write("\n".join(valid_lines))
Exemple #17
0
def get_processor(vocab_class_obj):
    """Build the processor list selected by the module-level ``conf``.

    When ``conf['use_sentencePiece']`` is truthy the processors are an
    ``OpenFileProcessor`` followed by an ``SPProcessor`` configured from
    ``conf['sentencePiece']``; no ``Tokenizer`` is involved.  Otherwise a
    ``Tokenizer`` around ``dna_tokenizer_n_char`` (no pre/post rules, no
    special cases) is created and used by a ``TokenizeProcessor`` that is
    followed by a ``NumericalizeProcessor``.

    Args:
        vocab_class_obj: vocabulary passed to ``NumericalizeProcessor``;
            only used on the non-sentencePiece path.

    Returns:
        ``(processor, tokenizer)``.  ``tokenizer`` is ``None`` on the
        sentencePiece path; previously that path raised
        ``UnboundLocalError`` because the name was never assigned.
    """
    if conf['use_sentencePiece']:
        sp_conf = conf['sentencePiece']
        # No fastai Tokenizer exists on this path; return an explicit None.
        tokenizer = None
        processor = [
            OpenFileProcessor(),
            SPProcessor(
                sp_model=sp_conf['model'],
                sp_vocab=sp_conf['vocab'],
                max_sentence_len=sp_conf['max_sentence_len'],
                max_vocab_sz=sp_conf['max_vocab'])
        ]
    else:
        tokenizer = Tokenizer(tok_func=dna_tokenizer_n_char,
                              pre_rules=[],
                              post_rules=[],
                              special_cases=[],
                              n_cpus=conf['n_workers'])
        processor = [
            TokenizeProcessor(tokenizer=tokenizer,
                              include_bos=True,
                              include_eos=True),
            NumericalizeProcessor(vocab=vocab_class_obj,
                                  max_vocab=conf['max_vocab'])
        ]
    return processor, tokenizer
import csv
import multiprocessing as mp
import typing as t
from pathlib import Path

import joblib
from fastai.text import Tokenizer, SpacyTokenizer, Vocab
from tqdm import tqdm

# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions.
# Both objects live at module level and are reused by every call to `tokenize`.
tokenizer = Tokenizer()
# spaCy-backed tokenizer for 'en'; passed to `tokenizer.process_text` in `tokenize`.
tok = SpacyTokenizer('en')


def tokenize(line: str) -> str:
    """Tokenize `line` with the module-level FastAI tokenizer.

    Returns the tokens joined by single spaces, terminated by a newline.
    """
    tokens = tokenizer.process_text(line, tok)  # parse with FastAI tokenizer
    return f"{' '.join(tokens)}\n"


def numericalize(doc: str, vocab: Vocab) -> t.List[int]:
    """Return `vocab.numericalize(doc)`.

    NOTE(review): `doc` is annotated as `str`; if `Vocab.numericalize`
    iterates its argument token by token, a plain string would be processed
    character by character -- confirm the intended input type.
    """
    token_ids = vocab.numericalize(doc)
    return token_ids


def build_vocab(docs: t.List[str], max_vocab: int = 10000, min_freq: int = 5) -> Vocab:
    """Create a `Vocab` from `docs` via `Vocab.create`.

    `max_vocab` and `min_freq` are forwarded unchanged as keyword arguments.
    """
    limits = {'max_vocab': max_vocab, 'min_freq': min_freq}
    return Vocab.create(docs, **limits)


if __name__ == '__main__':
    # Script entry point: locate the raw reviews CSV relative to the working directory.
    data_path = Path('../data/raw/reviews.csv')
    # Stop immediately if the input file is missing.
    # NOTE(review): `assert` is stripped under `python -O`, so this check
    # disappears in optimized runs.
    assert data_path.exists()
def get_texts_gensim(texts: List[str]) -> List[List[str]]:
    """Pre-process each text with `simple_preprocess`, then tokenize.

    The pre-processed tokens of every text are re-joined with single spaces
    before being handed to a default `Tokenizer`.
    """
    cleaned = []
    for text in texts:
        cleaned.append(' '.join(simple_preprocess(text)))
    return Tokenizer().process_all(cleaned)
def get_texts_fastai(texts: List[str]) -> List[List[str]]:
    """Apply `fix_html` to each text, then tokenize with a default `Tokenizer`."""
    cleaned = []
    for text in texts:
        cleaned.append(fix_html(text))
    return Tokenizer().process_all(cleaned)
Exemple #21
0
 def _apply_tokenizer(self, text):
     """Split `text` on whitespace, tokenize the pieces and return one flat token list."""
     # The tokenizer returns one token list per input word, e.g.
     # [[' ', 't_up', 'bos'], ['senjō'], ['no'], [' ', 't_up', 'eos']]
     tokenizer = Tokenizer(lang=self.lang)
     nested = tokenizer.proc_all_mp(partition_by_cores(text.split()))
     # Flatten the per-word lists, preserving order.
     flat = []
     for group in nested:
         flat.extend(group)
     return flat
Exemple #22
0
def get_default_tokenizer():
    """Return a new `Tokenizer` constructed with no arguments."""
    default_tokenizer = Tokenizer()
    return default_tokenizer
Exemple #23
0
def main(gpu: Param("GPU to run on", str)=None,
        max_cpu_per_dataloader: Param("Max CPU", int, opt=True)=8,
        bs: Param("batch size", int)=256,
        fp16: Param("mixed precision", int, opt=True)=0,
         use_sp_processor: Param("use sentence piece as processor", int) = 0,
        sp_model: Param("sentence piece trained model file", str)=None,
        sp_vocab: Param("sentence piece trained model file", str)=None,
    ):
    """Train an AWD_LSTM language model on the pickled sprot sequence frame.

    The frame is read from './data/sprot_lm/sprot_sequence_taxon_anc.pickle'
    and its 'seq_anc_tax' column is turned into a language-model data bunch
    with a 10% random validation split.  Training runs one frozen cycle and
    then three unfrozen 10-epoch stages (lr*10, lr, lr/10), saving the model
    and encoder after each stage and exporting the learner at the end.

    Args:
        gpu: passed to ``setup_distrib``; when the value it returns is
            ``None`` the model is wrapped in ``nn.DataParallel``, otherwise
            the learner is switched to distributed mode.
        max_cpu_per_dataloader: upper bound on data-loader workers.
        bs: batch size.
        fp16: if truthy, use mixed precision (only applied on the
            distributed path).
        use_sp_processor: if truthy, replace the character tokenizer
            pipeline with ``OpenFileProcessor`` + ``SPProcessor``.
        sp_model: model file handed to ``SPProcessor``.
        sp_vocab: vocab file handed to ``SPProcessor``.
    """
    datetime_str = f'{datetime.now():%Y-%m-%d_%H-%M-%S%z}'
    random_seed = 0
    max_vocab = 30000
    print('max_cpu_per_dataloader', max_cpu_per_dataloader, 'bs', bs,
        'fp16', fp16, 'sp_processor', use_sp_processor, 'sp_model', sp_model, 'sp_vocab', sp_vocab)

    # ## Prepare Dataset
    local_project_path = './data/sprot_lm/'

    #### Distributed
    print('gpu', gpu)
    gpu = setup_distrib(gpu)
    n_gpus = num_distrib()
    # Share the available CPUs between the processes when running on several GPUs.
    if n_gpus > 0:
        workers = min(max_cpu_per_dataloader, num_cpus()//n_gpus)
    else:
        workers = min(max_cpu_per_dataloader, num_cpus())
    print(gpu, 'n_gpus', n_gpus)
    print(gpu, 'workers', workers)

    # ## Prepare fastai
    np.random.seed(random_seed)

    # exist_ok avoids the check-then-create race when several distributed
    # processes start at the same time.
    os.makedirs(local_project_path, exist_ok=True)
    print('local_project_path:', local_project_path)

    # ## Tokenization
    tokenizer = Tokenizer(tok_func=dna_tokenizer, pre_rules=[],
                        post_rules=[], special_cases=[])
    processor = [TokenizeProcessor(tokenizer=tokenizer, include_bos=True,
                                include_eos=True), NumericalizeProcessor(max_vocab=max_vocab)]
    # Close the pickle file deterministically instead of leaking the handle.
    # NOTE(review): pickle.load executes code from the file; the pickle must be trusted.
    with open('./data/sprot_lm/sprot_sequence_taxon_anc.pickle', 'rb') as pickle_file:
        df = pickle.load(pickle_file)

    if use_sp_processor: # './data/sprot_lm/tmp/spm.model', './data/sprot_lm/tmp/spm.vocab'
        processor = [OpenFileProcessor(), SPProcessor(sp_model=sp_model, sp_vocab=sp_vocab, max_sentence_len=35826, max_vocab_sz=max_vocab)]
    data_lm = (TextList.from_df(df, path=local_project_path, cols='seq_anc_tax', processor=processor)
                    .split_by_rand_pct(0.1, seed = random_seed)
                    .label_for_lm()
                    .databunch(bs=bs, num_workers=workers))

    data_lm.vocab.save(local_project_path +
                       'vocab_lm_sprot_seq_anc_tax-' + datetime_str + '.pickle')

    print('data_cls Training set size', len(data_lm.train_ds))
    print('data_cls Validation set size', len(data_lm.valid_ds))
    print('vocab size ', len(data_lm.vocab.itos))

    learn_lm = language_model_learner(
        data_lm, AWD_LSTM, drop_mult=0.1, pretrained=False)

    if gpu is None:
        print(gpu, 'DataParallel')
        learn_lm.model = nn.DataParallel(learn_lm.model)
    else:
        print(gpu, 'to_distributed')
        learn_lm.to_distributed(gpu)
        if fp16:
            learn_lm.to_fp16()

    lr = 3e-3
    print(gpu, 'freeze')
    learn_lm.freeze()
    learn_lm.fit_one_cycle(1, lr, moms=(0.8, 0.7))  # I don't know why multigpu doesn't work without first freezing

    print(gpu, 'unfreeze')
    learn_lm.unfreeze()
    # Stage 1: high learning rate.
    learn_lm.fit_one_cycle(10, lr*10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-1-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-1-enc-' + datetime_str)

    # Stage 2: base learning rate.
    learn_lm.fit_one_cycle(10, lr, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-2-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-2-enc-' + datetime_str)

    # Stage 3: low learning rate, followed by the final export.
    learn_lm.fit_one_cycle(10, lr/10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-3' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-3-enc-' + datetime_str)
    learn_lm.export(file = 'export-lm-sp-ans-v1-3' + datetime_str+ '.pkl')
    print('Done')
Exemple #24
0
def main():
    global model_to_save
    global experiment
    global rabbit
    rabbit = MyRabbit(args)
    if rabbit.model_params.dont_limit_num_uniq_tokens:
        raise NotImplementedError()
    if rabbit.model_params.frame_as_qa: raise NotImplementedError
    if rabbit.run_params.drop_val_loss_calc: raise NotImplementedError
    if rabbit.run_params.use_softrank_influence and not rabbit.run_params.freeze_all_but_last_for_influence:
        raise NotImplementedError
    if rabbit.train_params.weight_influence: raise NotImplementedError
    experiment = Experiment(rabbit.train_params + rabbit.model_params +
                            rabbit.run_params)
    print('Model name:', experiment.model_name)
    use_pretrained_doc_encoder = rabbit.model_params.use_pretrained_doc_encoder
    use_pointwise_loss = rabbit.train_params.use_pointwise_loss
    query_token_embed_len = rabbit.model_params.query_token_embed_len
    document_token_embed_len = rabbit.model_params.document_token_embed_len
    _names = []
    if not rabbit.model_params.dont_include_titles:
        _names.append('with_titles')
    if rabbit.train_params.num_doc_tokens_to_consider != -1:
        _names.append('num_doc_toks_' +
                      str(rabbit.train_params.num_doc_tokens_to_consider))
    if not rabbit.run_params.just_caches:
        if rabbit.model_params.dont_include_titles:
            document_lookup = read_cache(name('./doc_lookup.json', _names),
                                         get_robust_documents)
        else:
            document_lookup = read_cache(name('./doc_lookup.json', _names),
                                         get_robust_documents_with_titles)
    num_doc_tokens_to_consider = rabbit.train_params.num_doc_tokens_to_consider
    document_title_to_id = read_cache(
        './document_title_to_id.json',
        lambda: create_id_lookup(document_lookup.keys()))
    with open('./caches/106756_most_common_doc.json', 'r') as fh:
        doc_token_set = set(json.load(fh))
        tokenizer = Tokenizer()
        tokenized = set(
            sum(
                tokenizer.process_all(list(
                    get_robust_eval_queries().values())), []))
        doc_token_set = doc_token_set.union(tokenized)
    use_bow_model = not any([
        rabbit.model_params[attr] for attr in
        ['use_doc_out', 'use_cnn', 'use_lstm', 'use_pretrained_doc_encoder']
    ])
    use_bow_model = use_bow_model and not rabbit.model_params.dont_use_bow
    if use_bow_model:
        documents, document_token_lookup = read_cache(
            name(f'./docs_fs_tokens_limit_uniq_toks_qrels_and_106756.pkl',
                 _names),
            lambda: prepare_fs(document_lookup,
                               document_title_to_id,
                               num_tokens=num_doc_tokens_to_consider,
                               token_set=doc_token_set))
        if rabbit.model_params.keep_top_uniq_terms is not None:
            documents = [
                dict(
                    nlargest(rabbit.model_params.keep_top_uniq_terms,
                             _.to_pairs(doc), itemgetter(1)))
                for doc in documents
            ]
    else:
        documents, document_token_lookup = read_cache(
            name(
                f'./parsed_docs_{num_doc_tokens_to_consider}_tokens_limit_uniq_toks_qrels_and_106756.json',
                _names), lambda: prepare(document_lookup,
                                         document_title_to_id,
                                         num_tokens=num_doc_tokens_to_consider,
                                         token_set=doc_token_set))
    if not rabbit.run_params.just_caches:
        train_query_lookup = read_cache('./robust_train_queries.json',
                                        get_robust_train_queries)
        train_query_name_to_id = read_cache(
            './train_query_name_to_id.json',
            lambda: create_id_lookup(train_query_lookup.keys()))
    train_queries, query_token_lookup = read_cache(
        './parsed_robust_queries_dict.json',
        lambda: prepare(train_query_lookup,
                        train_query_name_to_id,
                        token_lookup=document_token_lookup,
                        token_set=doc_token_set,
                        drop_if_any_unk=True))
    query_tok_to_doc_tok = {
        idx: document_token_lookup.get(query_token)
        or document_token_lookup['<unk>']
        for query_token, idx in query_token_lookup.items()
    }
    names = [RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set]]
    if rabbit.train_params.use_pointwise_loss or not rabbit.run_params.just_caches:
        train_data = read_cache(
            name('./robust_train_query_results_tokens_qrels_and_106756.json',
                 names), lambda: read_query_result(
                     train_query_name_to_id,
                     document_title_to_id,
                     train_queries,
                     path='./indri/query_result' + RANKER_NAME_TO_SUFFIX[
                         rabbit.train_params.ranking_set]))
    else:
        train_data = []
    q_embed_len = rabbit.model_params.query_token_embed_len
    doc_embed_len = rabbit.model_params.document_token_embed_len
    if rabbit.model_params.append_difference or rabbit.model_params.append_hadamard:
        assert q_embed_len == doc_embed_len, 'Must use same size doc and query embeds when appending diff or hadamard'
    if q_embed_len == doc_embed_len:
        glove_lookup = get_glove_lookup(
            embedding_dim=q_embed_len,
            use_large_embed=rabbit.model_params.use_large_embed,
            use_word2vec=rabbit.model_params.use_word2vec)
        q_glove_lookup = glove_lookup
        doc_glove_lookup = glove_lookup
    else:
        q_glove_lookup = get_glove_lookup(
            embedding_dim=q_embed_len,
            use_large_embed=rabbit.model_params.use_large_embed,
            use_word2vec=rabbit.model_params.use_word2vec)
        doc_glove_lookup = get_glove_lookup(
            embedding_dim=doc_embed_len,
            use_large_embed=rabbit.model_params.use_large_embed,
            use_word2vec=rabbit.model_params.use_word2vec)
    num_query_tokens = len(query_token_lookup)
    num_doc_tokens = len(document_token_lookup)
    doc_encoder = None
    if use_pretrained_doc_encoder or rabbit.model_params.use_doc_out:
        doc_encoder, document_token_embeds = get_doc_encoder_and_embeddings(
            document_token_lookup, rabbit.model_params.only_use_last_out)
        if rabbit.model_params.use_glove:
            query_token_embeds_init = init_embedding(q_glove_lookup,
                                                     query_token_lookup,
                                                     num_query_tokens,
                                                     query_token_embed_len)
        else:
            query_token_embeds_init = from_doc_to_query_embeds(
                document_token_embeds, document_token_lookup,
                query_token_lookup)
        if not rabbit.train_params.dont_freeze_pretrained_doc_encoder:
            dont_update(doc_encoder)
        if rabbit.model_params.use_doc_out:
            doc_encoder = None
    else:
        document_token_embeds = init_embedding(doc_glove_lookup,
                                               document_token_lookup,
                                               num_doc_tokens,
                                               document_token_embed_len)
        if rabbit.model_params.use_single_word_embed_set:
            query_token_embeds_init = document_token_embeds
        else:
            query_token_embeds_init = init_embedding(q_glove_lookup,
                                                     query_token_lookup,
                                                     num_query_tokens,
                                                     query_token_embed_len)
    if not rabbit.train_params.dont_freeze_word_embeds:
        dont_update(document_token_embeds)
        dont_update(query_token_embeds_init)
    else:
        do_update(document_token_embeds)
        do_update(query_token_embeds_init)
    if rabbit.train_params.add_rel_score:
        query_token_embeds, additive = get_additive_regularized_embeds(
            query_token_embeds_init)
        rel_score = RelScore(query_token_embeds, document_token_embeds,
                             rabbit.model_params, rabbit.train_params)
    else:
        query_token_embeds = query_token_embeds_init
        additive = None
        rel_score = None
    eval_query_lookup = get_robust_eval_queries()
    eval_query_name_document_title_rels = get_robust_rels()
    test_query_names = []
    val_query_names = []
    for query_name in eval_query_lookup:
        if len(val_query_names) >= 50: test_query_names.append(query_name)
        else: val_query_names.append(query_name)
    test_query_name_document_title_rels = _.pick(
        eval_query_name_document_title_rels, test_query_names)
    test_query_lookup = _.pick(eval_query_lookup, test_query_names)
    test_query_name_to_id = create_id_lookup(test_query_lookup.keys())
    test_queries, __ = prepare(test_query_lookup,
                               test_query_name_to_id,
                               token_lookup=query_token_lookup)
    eval_ranking_candidates = read_query_test_rankings(
        './indri/query_result_test' +
        RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
    test_candidates_data = read_query_result(
        test_query_name_to_id,
        document_title_to_id,
        dict(zip(range(len(test_queries)), test_queries)),
        path='./indri/query_result_test' +
        RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
    test_ranking_candidates = process_raw_candidates(test_query_name_to_id,
                                                     test_queries,
                                                     document_title_to_id,
                                                     test_query_names,
                                                     eval_ranking_candidates)
    test_data = process_rels(test_query_name_document_title_rels,
                             document_title_to_id, test_query_name_to_id,
                             test_queries)
    val_query_name_document_title_rels = _.pick(
        eval_query_name_document_title_rels, val_query_names)
    val_query_lookup = _.pick(eval_query_lookup, val_query_names)
    val_query_name_to_id = create_id_lookup(val_query_lookup.keys())
    val_queries, __ = prepare(val_query_lookup,
                              val_query_name_to_id,
                              token_lookup=query_token_lookup)
    val_candidates_data = read_query_result(
        val_query_name_to_id,
        document_title_to_id,
        dict(zip(range(len(val_queries)), val_queries)),
        path='./indri/query_result_test' +
        RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
    val_ranking_candidates = process_raw_candidates(val_query_name_to_id,
                                                    val_queries,
                                                    document_title_to_id,
                                                    val_query_names,
                                                    eval_ranking_candidates)
    val_data = process_rels(val_query_name_document_title_rels,
                            document_title_to_id, val_query_name_to_id,
                            val_queries)
    train_normalized_score_lookup = read_cache(
        name('./train_normalized_score_lookup.pkl', names),
        lambda: get_normalized_score_lookup(train_data))
    test_normalized_score_lookup = get_normalized_score_lookup(
        test_candidates_data)
    val_normalized_score_lookup = get_normalized_score_lookup(
        val_candidates_data)
    if use_pointwise_loss:
        normalized_train_data = read_cache(
            name('./normalized_train_query_data_qrels_and_106756.json', names),
            lambda: normalize_scores_query_wise(train_data))
        collate_fn = lambda samples: collate_query_samples(
            samples,
            use_bow_model=use_bow_model,
            use_dense=rabbit.model_params.use_dense)
        train_dl = build_query_dataloader(
            documents,
            normalized_train_data,
            rabbit.train_params,
            rabbit.model_params,
            cache=name('train_ranking_qrels_and_106756.json', names),
            limit=10,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=train_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=False)
        test_dl = build_query_dataloader(
            documents,
            test_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=test_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True)
        val_dl = build_query_dataloader(
            documents,
            val_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=val_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True)
        model = PointwiseScorer(query_token_embeds, document_token_embeds,
                                doc_encoder, rabbit.model_params,
                                rabbit.train_params)
    else:
        if rabbit.train_params.use_noise_aware_loss:
            ranker_query_str_to_rankings = get_ranker_query_str_to_rankings(
                train_query_name_to_id,
                document_title_to_id,
                train_queries,
                limit=rabbit.train_params.num_snorkel_train_queries)
            query_names = reduce(
                lambda acc, query_to_ranking: acc.intersection(
                    set(query_to_ranking.keys()))
                if len(acc) != 0 else set(query_to_ranking.keys()),
                ranker_query_str_to_rankings.values(), set())
            all_ranked_lists_by_ranker = _.map_values(
                ranker_query_str_to_rankings, lambda query_to_ranking:
                [query_to_ranking[query] for query in query_names])
            ranker_query_str_to_pairwise_bins = get_ranker_query_str_to_pairwise_bins(
                train_query_name_to_id,
                document_title_to_id,
                train_queries,
                limit=rabbit.train_params.num_train_queries)
            snorkeller = Snorkeller(ranker_query_str_to_pairwise_bins)
            snorkeller.train(all_ranked_lists_by_ranker)
            calc_marginals = snorkeller.calc_marginals
        else:
            calc_marginals = None
        collate_fn = lambda samples: collate_query_pairwise_samples(
            samples,
            use_bow_model=use_bow_model,
            calc_marginals=calc_marginals,
            use_dense=rabbit.model_params.use_dense)
        if rabbit.run_params.load_influences:
            try:
                with open(rabbit.run_params.influences_path) as fh:
                    pairs_to_flip = defaultdict(set)
                    for pair, influence in json.load(fh):
                        if rabbit.train_params.use_pointwise_loss:
                            condition = True
                        else:
                            condition = influence < rabbit.train_params.influence_thresh
                        if condition:
                            query = tuple(pair[1])
                            pairs_to_flip[query].add(tuple(pair[0]))
            except FileNotFoundError:
                pairs_to_flip = None
        else:
            pairs_to_flip = None
        train_dl = build_query_pairwise_dataloader(
            documents,
            train_data,
            rabbit.train_params,
            rabbit.model_params,
            pairs_to_flip=pairs_to_flip,
            cache=name('train_ranking_qrels_and_106756.json', names),
            limit=10,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=train_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=False)
        test_dl = build_query_pairwise_dataloader(
            documents,
            test_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=test_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True)
        val_dl = build_query_pairwise_dataloader(
            documents,
            val_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=val_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True)
        val_rel_dl = build_query_pairwise_dataloader(
            documents,
            val_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=val_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True,
            rel_vs_irrel=True,
            candidates=val_ranking_candidates,
            num_to_rank=rabbit.run_params.num_to_rank)
        model = PairwiseScorer(query_token_embeds,
                               document_token_embeds,
                               doc_encoder,
                               rabbit.model_params,
                               rabbit.train_params,
                               use_bow_model=use_bow_model)
    train_ranking_dataset = RankingDataset(
        documents,
        train_dl.dataset.rankings,
        rabbit.train_params,
        rabbit.model_params,
        rabbit.run_params,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=train_normalized_score_lookup,
        use_bow_model=use_bow_model,
        use_dense=rabbit.model_params.use_dense)
    test_ranking_dataset = RankingDataset(
        documents,
        test_ranking_candidates,
        rabbit.train_params,
        rabbit.model_params,
        rabbit.run_params,
        relevant=test_dl.dataset.rankings,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        cheat=rabbit.run_params.cheat,
        normalized_score_lookup=test_normalized_score_lookup,
        use_bow_model=use_bow_model,
        use_dense=rabbit.model_params.use_dense)
    val_ranking_dataset = RankingDataset(
        documents,
        val_ranking_candidates,
        rabbit.train_params,
        rabbit.model_params,
        rabbit.run_params,
        relevant=val_dl.dataset.rankings,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        cheat=rabbit.run_params.cheat,
        normalized_score_lookup=val_normalized_score_lookup,
        use_bow_model=use_bow_model,
        use_dense=rabbit.model_params.use_dense)
    if rabbit.train_params.memorize_test:
        train_dl = test_dl
        train_ranking_dataset = test_ranking_dataset
    model_data = DataBunch(train_dl,
                           val_rel_dl,
                           test_dl,
                           collate_fn=collate_fn,
                           device=torch.device('cuda') if
                           torch.cuda.is_available() else torch.device('cpu'))
    multi_objective_model = MultiObjective(model, rabbit.train_params,
                                           rel_score, additive)
    model_to_save = multi_objective_model
    if rabbit.train_params.memorize_test:
        try:
            del train_data
        except:
            pass
    if not rabbit.run_params.just_caches:
        del document_lookup
        del train_query_lookup
    del query_token_lookup
    del document_token_lookup
    del train_queries
    try:
        del glove_lookup
    except UnboundLocalError:
        del q_glove_lookup
        del doc_glove_lookup
    if rabbit.run_params.load_model:
        try:
            multi_objective_model.load_state_dict(
                torch.load(rabbit.run_params.load_path))
        except RuntimeError:
            dp = nn.DataParallel(multi_objective_model)
            dp.load_state_dict(torch.load(rabbit.run_params.load_path))
            multi_objective_model = dp.module
    else:
        train_model(multi_objective_model, model_data, train_ranking_dataset,
                    val_ranking_dataset, test_ranking_dataset,
                    rabbit.train_params, rabbit.model_params,
                    rabbit.run_params, experiment)
    if rabbit.train_params.fine_tune_on_val:
        fine_tune_model_data = DataBunch(
            val_rel_dl,
            val_rel_dl,
            test_dl,
            collate_fn=collate_fn,
            device=torch.device('cuda')
            if torch.cuda.is_available() else torch.device('cpu'))
        train_model(multi_objective_model,
                    fine_tune_model_data,
                    val_ranking_dataset,
                    val_ranking_dataset,
                    test_ranking_dataset,
                    rabbit.train_params,
                    rabbit.model_params,
                    rabbit.run_params,
                    experiment,
                    load_path=rabbit.run_params.load_path)
    multi_objective_model.eval()
    device = model_data.device
    gpu_multi_objective_model = multi_objective_model.to(device)
    if rabbit.run_params.calc_influence:
        if rabbit.run_params.freeze_all_but_last_for_influence:
            last_layer_idx = _.find_last_index(
                multi_objective_model.model.pointwise_scorer.layers,
                lambda layer: isinstance(layer, nn.Linear))
            to_last_layer = lambda x: gpu_multi_objective_model(
                *x, to_idx=last_layer_idx)
            last_layer = gpu_multi_objective_model.model.pointwise_scorer.layers[
                last_layer_idx]
            diff_wrt = [p for p in last_layer.parameters() if p.requires_grad]
        else:
            diff_wrt = None
        test_hvps = calc_test_hvps(
            multi_objective_model.loss,
            gpu_multi_objective_model,
            DeviceDataLoader(train_dl, device, collate_fn=collate_fn),
            val_rel_dl,
            rabbit.run_params,
            diff_wrt=diff_wrt,
            show_progress=True,
            use_softrank_influence=rabbit.run_params.use_softrank_influence)
        influences = []
        if rabbit.train_params.use_pointwise_loss:
            num_real_samples = len(train_dl.dataset)
        else:
            num_real_samples = train_dl.dataset._num_pos_pairs
        if rabbit.run_params.freeze_all_but_last_for_influence:
            _sampler = SequentialSamplerWithLimit(train_dl.dataset,
                                                  num_real_samples)
            _batch_sampler = BatchSampler(_sampler,
                                          rabbit.train_params.batch_size,
                                          False)
            _dl = DataLoader(train_dl.dataset,
                             batch_sampler=_batch_sampler,
                             collate_fn=collate_fn)
            sequential_train_dl = DeviceDataLoader(_dl,
                                                   device,
                                                   collate_fn=collate_fn)
            influences = calc_dataset_influence(gpu_multi_objective_model,
                                                to_last_layer,
                                                sequential_train_dl,
                                                test_hvps,
                                                sum_p=True).tolist()
        else:
            for i in progressbar(range(num_real_samples)):
                train_sample = train_dl.dataset[i]
                x, labels = to_device(collate_fn([train_sample]), device)
                device_train_sample = (x, labels.squeeze())
                influences.append(
                    calc_influence(multi_objective_model.loss,
                                   gpu_multi_objective_model,
                                   device_train_sample,
                                   test_hvps,
                                   diff_wrt=diff_wrt).sum().tolist())
        with open(rabbit.run_params.influences_path, 'w+') as fh:
            json.dump([[train_dl.dataset[idx][1], influence]
                       for idx, influence in enumerate(influences)], fh)
import pandas as pd
import numpy as np
import os
import pickle

from pathlib import Path
from sklearn.utils import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from fastai.text import Tokenizer
from eda import EDA

# Module-level fastai Tokenizer built with its default arguments; it is
# created once at import time and reused by extract_features for every call.
tok = Tokenizer()

def extract_features(df, out_path, max_features=30000, vectorizer=None):
	"""Vectorize the review text in *df* and pickle the resulting feature set.

	The ``reviewText`` column is tokenized with the module-level ``tok`` and
	then turned into a feature matrix. When no vectorizer is supplied, a new
	``TfidfVectorizer`` is fitted on these reviews; otherwise the supplied
	vectorizer is applied as-is (``transform`` only, no refitting), which lets
	a vectorizer returned from an earlier call be reused on another frame.

	Args:
		df: Frame-like object exposing a ``reviewText`` column (read via
			``df['reviewText'].tolist()``) and an ``overall`` attribute
			(read via ``df.overall.values``).
		out_path: Path of the file the pickled feature set is written to.
			The file is opened in ``'wb'`` mode, so existing content is
			overwritten.
		max_features: ``max_features`` passed to the ``TfidfVectorizer``
			created here; ignored when ``vectorizer`` is given.
		vectorizer: Optional already-fitted vectorizer exposing
			``transform``. If ``None``, a new one is created and fitted.

	Returns:
		The vectorizer that produced the features (the newly fitted one, or
		the ``vectorizer`` argument unchanged).
	"""
	reviews = df['reviewText'].tolist()
	tok_reviews = tok.process_all(reviews)

	if vectorizer is None:
		# The reviews have already been run through tok.process_all, so the
		# vectorizer's own preprocessing and tokenizing steps are replaced
		# with identity functions and each item is passed through untouched.
		vectorizer = TfidfVectorizer(max_features=max_features, preprocessor=lambda x: x,
			tokenizer=lambda x: x, min_df=5)
		X = vectorizer.fit_transform(tok_reviews)
	else:
		X = vectorizer.transform(tok_reviews)

	featset = Bunch(X=X, y=df.overall.values)
	# Use a context manager so the file is flushed and closed deterministically
	# instead of leaking the handle until garbage collection.
	with open(out_path, 'wb') as fh:
		pickle.dump(featset, fh)

	return vectorizer