コード例 #1
0
ファイル: embeddings.py プロジェクト: codegram/deepspain
def word_embeddings(learner: LanguageLearner,
                    s: str,
                    debug: bool = False) -> Tensor:
    """Return the encoder's per-token activations for the string *s*.

    Tokenizes *s* with the learner's data pipeline, resets the model's
    hidden state, and runs only the encoder (``learner.model[0]``).
    Each step is timed via ``measure`` when *debug* is true.
    """
    numericalized, _ = measure("tokenizing",
                               lambda: learner.data.one_item(s), debug)
    # Reset the RNN hidden state so earlier calls cannot leak into this one.
    measure("resetting model", lambda: learner.model.reset(), debug)
    encoder = learner.model[0]
    encoder_outputs = measure("predicting",
                              lambda: encoder(numericalized), debug)
    # Last group, last element: the final-layer activations.
    final_activations = encoder_outputs[-1][-1]
    return final_activations
コード例 #2
0
ファイル: evaluate.py プロジェクト: codegram/deepspain
def main(models_path: Path, test_data_json: Path, debug: bool):
    """Evaluates a language model against a test data set."""

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)

        print(f"Loading test data from {test_data_json}...")
        # Collect every valid JSON-lines record as a dict.
        with jsonlines.open(test_data_json) as reader:
            records = list(reader.iter(type=dict, skip_invalid=True))
        frame = pd.DataFrame(records)

        # Build an LM databunch over the title and content columns;
        # no validation split is needed for evaluation.
        text_list = TextList.from_df(frame,
                                     path=models_path,
                                     cols=["title", "content"])
        test_databunch = text_list.split_none().label_for_lm().databunch(bs=4)

        learner = measure(
            "model loading",
            lambda: from_model(models_path, model_name="model_large_finetuned"
                               ),
            debug,
        )

        print(learner.validate(dl=test_databunch.train_dl))
コード例 #3
0
ファイル: search.py プロジェクト: codegram/deepspain
def search(
    es: Elasticsearch,
    learner: LanguageLearner,
    index_name: str,
    query: str,
    debug=False,
):
    """Embed *query* and return the title of the best-matching document.

    Builds an ElasticSearch ``script_score`` query that sums the cosine
    similarity between each query-embedding chunk and the corresponding
    ``embeddings_<i>`` field, then returns the top hit's title.
    """
    embeddings = doc2vec(learner, query, debug)
    chunk_count = len(embeddings)
    # One script parameter per embedding chunk.
    params = {
        f"queryVector{i}": chunk.tolist()
        for i, chunk in enumerate(embeddings)
    }
    similarity_terms = [
        f"cosineSimilarity(params.queryVector{i}, doc['embeddings_{i}'])"
        for i in range(chunk_count)
    ]
    # "+0.0" keeps the script score non-negative-friendly and well-formed
    # even when the term list changes.
    body = {
        "size": 1,
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "+".join(similarity_terms) + "+0.0",
                    "params": params,
                },
            }
        },
    }
    response = measure("search",
                       lambda: es.search(index=index_name, body=body), debug)
    return response["hits"]["hits"][0]["_source"]["title"]
コード例 #4
0
ファイル: embeddings.py プロジェクト: codegram/deepspain
def doc2vec(learner: LanguageLearner,
            s: str,
            debug: bool = False,
            max_dim: int = 1024) -> Sequence[Tensor]:
    """Pool the word embeddings of *s* into document vectors.

    Concatenates the last time-step state with max- and mean-pooled
    states along dim 1, moves the result to CPU, and splits it into
    chunks of at most *max_dim* values.
    """
    with torch.no_grad():
        hidden = measure("get_full_embeddings",
                         lambda: word_embeddings(learner, s, debug), debug)
        mean_pooled = hidden.mean(dim=1)
        max_pooled = hidden.max(dim=1)[0]
        # Last time-step activations; double cast works around a pyright issue.
        final_state = cast(Tensor, cast(Any, hidden)[:, -1])
        pooled = torch.cat([final_state, max_pooled, mean_pooled], 1)
        return pooled.to("cpu").squeeze().split(max_dim)
コード例 #5
0
ファイル: index_documents.py プロジェクト: codegram/deepspain
def main(
    models_path: Path,
    data_path: Path,
    drop_index: bool,
    index_name: str,
    host: str,
    port: int,
    limit_bytes: int,
    debug: bool,
):
    """Index all the training rows in <databunch.pkl> into ElasticSearch."""

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)

        learner = measure(
            "encoder loading",
            lambda: from_encoder(models_path,
                                 encoder_name="encoder_large_finetuned"),
            debug,
        )

        es = Elasticsearch(hosts=[{"host": host, "port": port}])
        if drop_index:
            print("Recreating index...")
            recreate_index(es, learner, index_name, debug)

        print("Loading data...")
        train_df = load_databunch(Path(data_path), debug).train_ds.inner_df
        row_count = train_df.shape[0]
        print(f"Indexing {row_count} rows...")
        # Index row by row; `measure` both times and labels each row's work.
        for position, record in train_df.iterrows():
            measure(
                f"{position}/{row_count}",
                lambda: index_document(es, learner, index_name,
                                       record.to_dict(), limit_bytes, debug),
                debug,
            )
コード例 #6
0
ファイル: search.py プロジェクト: codegram/deepspain
def main(
    models_path: Path,
    query: str,
    index_name: str = "boe",
    host: str = "localhost",
    port: int = 9200,
    debug: bool = False,
):
    """Search the index for the document best matching *query*.

    Loads the encoder, connects to ElasticSearch, and prints the title
    of the top-scoring document.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        learner = measure(
            "model loading",
            lambda: from_encoder(models_path, encoder_name="encoder_large_finetuned"),
            debug,
        )
        es = Elasticsearch(hosts=[{"host": host, "port": port}])
        # BUG FIX: `search` returns a single title *string*, not a list of
        # hits; iterating it would print one character per line. Print the
        # returned title once instead.
        title = search(es, learner, index_name, query, debug)
        print("\n")
        print(title)
コード例 #7
0
ファイル: dataset.py プロジェクト: codegram/deepspain
def load_databunch(pkl_path: Path, debug=False) -> TextLMDataBunch:
    """Load the pickled databunch at *pkl_path*, timing the load via ``measure``."""
    directory = pkl_path.parent
    name = pkl_path.name
    return measure("loading dataframe",
                   lambda: load_data(directory, name), debug)