Example #1
def semantics(reply, stmt=None, **kwargs):
    global nlp
    nlp = kwargs.get('nlp', nlp)
    if nlp is None or not stmt:
        return 0.0

    cos_sim = nlp(reply).similarity(nlp(stmt))

    return cos_sim
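A minimal usage sketch (the spaCy model name en_core_web_md and the sample sentences are illustrative, not part of the example; any model with word vectors works):

import spacy

nlp = spacy.load('en_core_web_md')  # module-level model used as the default pipeline
print(semantics('The LDL level is high.', 'Cholesterol is elevated.', nlp=nlp))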
Example #2
def generate_sentence(spec=SENTENCE_SPEC, sentence_id=None):
    """ Generate random sentence using word probabilities specified in SENTENCE_SPEC

    >>> spec = {
    ...     "answers":[[{"HDL":0.95,"good_cholesterol":0.05}, {"150": 0.01,"145": 0.01,"unk": 0.98}],
    ...     "sentences":["Patient LDL level is 100, ________ level is 50, and the total is ______ .",]
    ...     }
    >>> s = generate_sentence(spec=spec, sentence_id=0)
    >>> s  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    'Patient LDL level is 100, ... level is 50, and the total is ... .'
    >>> s[26:42] in ('HDL level is 50,', 'good_cholesterol')
    True
    >>> s[60:63] in ('150', '145', 'unk')
    True
    """
    sentences = spec['sentences']
    if sentence_id is None:
        sentence_id = np.random.randint(0, len(sentences))
    sentence = sentences[sentence_id]
    answer = spec['answers'][sentence_id]
    i_unk = 0
    tokens = []
    for i, tok in enumerate(nlp(sentence)):
        if re.match(r'^(_+|unk|\[MASK\])$', tok.text):
            # sample a filler for this blank according to the specified probabilities
            possible_tokens, p = list(zip(*answer[i_unk].items()))
            tokens.append(np.random.choice(a=possible_tokens, p=p))
            i_unk += 1
        else:
            tokens.append(tok.text)
    return ' '.join(tokens)
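For reference, a spec dict in the shape the function expects (values copied from the docstring above):

spec = {
    'answers': [[{'HDL': 0.95, 'good_cholesterol': 0.05},
                 {'150': 0.01, '145': 0.01, 'unk': 0.98}]],
    'sentences': ['Patient LDL level is 100, ________ level is 50, and the total is ______ .'],
}
print(generate_sentence(spec=spec, sentence_id=0))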
Example #3
def get_sentences(df, size_limit=50000, vector_dim=None):
    vector_dim = len(
        nlp('word').vector) if vector_dim is None else int(vector_dim)
    sents = []
    sentvecs = []  # accumulate sentence vectors across all documents (parallel to sents)
    docvecs = np.zeros((len(df), vector_dim))
    encodings = []
    for file_id, row in tqdm(df.iterrows(), total=len(df)):
        encodings.append('utf8')
        if row['size'] <= size_limit and row['is_journal']:
            try:
                with open(row['path'], 'rb') as fin:
                    bintext = fin.read()
                try:
                    text = bintext.decode()
                except UnicodeDecodeError:
                    encodings[-1] = 'latin'
                    log.warning(f"LATIN?: {row['path']}")
                    text = bintext.decode('latin')
                doc = nlp(text)
            except UnicodeDecodeError:
                log.error(f"UnicodeDecodeError: {row['path']}")
                continue
            docvecs[file_id, :] = np.array(list(doc.vector))
            docsents = [
                dict(sentence_pos=f'{file_id}-{j}',
                     file_id=file_id,
                     text=s.text) for j, s in enumerate(doc.sents)
            ]
            log.info(f"Read {len(docsents)} sentences: {row['path']}")
            # print(doc.vector)
            sents.extend(docsents)
            sentvecs.extend([s.vector for s in doc.sents])
        else:
            log.warn(f"skipped {row['path']}")

    df['encoding'] = encodings
    df = pd.concat([df, pd.DataFrame(np.array(docvecs))], axis=1)
    df_sents = pd.DataFrame(sents, index=list(range(len(sents))))
    df_sents = pd.concat([df_sents, pd.DataFrame(np.array(sentvecs))], axis=1)
    df_sents.index.name = 'sentence_id'
    return df, df_sents
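A minimal sketch of the expected input DataFrame, limited to the three columns the loop reads ('path', 'size', 'is_journal'); the journal file and its text are made up and written to disk so the sketch is self-contained:

import pandas as pd

with open('journal_2020-01-01.txt', 'w') as fout:
    fout.write('Sailed past the cliffs today. The anchor dragged in the storm.')
df_files = pd.DataFrame([
    dict(path='journal_2020-01-01.txt', size=64, is_journal=True),
])
df_docs, df_sents = get_sentences(df_files)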
Example #4
def semantics(reply, stmt=None, **kwargs):
    """ Compute word2vec docvec cosine similarity (fall back to character IOU)

    >>> semantics('Hello world!', 'Goodbye big earth!') > .5
    True
    """
    global nlp
    nlp = kwargs.get('nlp', nlp)
    if nlp is None or not stmt or not reply:
        return 0.0

    reply_doc, stmt_doc = nlp(reply), nlp(stmt)

    if not reply_doc or not stmt_doc or not reply_doc.has_vector or not stmt_doc.has_vector:
        # FIXME: Levenshtein distance (e.g. fuzzywuzzy) would be a better fallback
        return iou(reply, stmt)

    cos_sim = reply_doc.similarity(stmt_doc)
    log.debug(f'cos_sim={cos_sim}')
    return cos_sim
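The iou() fallback is not shown on this page; a plausible minimal version matching the "character IOU" description in the docstring (an assumption, not necessarily the project's implementation):

def iou(a, b):
    """ Character-set intersection-over-union similarity in [0, 1] """
    chars_a, chars_b = set(a.lower()), set(b.lower())
    return len(chars_a & chars_b) / max(len(chars_a | chars_b), 1)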
Example #5
def term_vector_dict(terms, keys=None):
    terms = [str(t) if t else '' for t in terms]
    keys = terms if keys is None else list(keys)
    vector_list = []
    log.info(f'Computing doc vectors for {len(terms)} terms...')
    for k, term in zip(keys, terms):
        vec = nlp(term).vector  # term can sometimes (rarely) be a float because of pd.read_csv (df_titles), hence the str() coercion above
        vec /= np.linalg.norm(vec) or 1.
        # vec = vec.round(7)
        mask_nonzero = np.abs(vec) > 0
        if mask_nonzero.sum() < len(mask_nonzero):
            num_zeros = len(mask_nonzero) - mask_nonzero.sum()
            log.warning(f'BAD VEC: {term} [0]*{num_zeros}')
        vector_list.append((k, vec))
    return dict(vector_list)
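Example usage (terms chosen for illustration). Because each vector is L2-normalized above, a plain dot product between two entries is their cosine similarity:

import numpy as np

vectors = term_vector_dict(['cholesterol', 'HDL', 'LDL'])
print(np.dot(vectors['HDL'], vectors['LDL']))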
Example #6
def search_csv(
        csv_path='/midata/private/journal/files.csv',
        query='Misima island port harbor derelict ship PNG Papua New Guinea Australia harbor storm sailing cliffs anchor drag',
        num_results=10,
        num_dims=300):
    df = pd.read_csv(csv_path, index_col=0)
    index_path = os.path.join(os.path.dirname(csv_path), 'files_index.ann')
    index = AnnoyIndex(f=num_dims)
    index.load(index_path)
    vec = nlp(query).vector
    paths = []
    for i in index.get_nns_by_vector(vec, num_results):
        path = df.iloc[i]['path']
        paths.append(path)
        print(path)
        with open(path, 'rb') as fin:
            bintext = b''.join(fin.readlines()[:10])
        try:
            text = bintext.decode()
        except UnicodeDecodeError:
            text = bintext.decode('latin')
        print(text)
        print('-' * 120)
    return paths
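search_csv() assumes an Annoy index has already been saved next to the CSV as files_index.ann. A minimal sketch of how such an index could be built from a (num_docs, 300) array of document vectors like the docvecs computed in get_sentences() above (the 'angular' metric and tree count are assumptions):

from annoy import AnnoyIndex

def build_files_index(docvecs, index_path, num_trees=10):
    index = AnnoyIndex(f=docvecs.shape[1], metric='angular')
    for i, vec in enumerate(docvecs):
        index.add_item(i, vec)
    index.build(num_trees)
    index.save(index_path)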
Example #7
def scrape_articles(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                    see_also=True, max_articles=10000, max_depth=3):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> df = scrape_articles(['ELIZA'], see_also=False)
    >>> df.shape
    (87, 4)
    >>> df.columns
    Index(['depth', 'title', 'section', 'sentence'], dtype='object')
    """

    titles = list([titles] if isinstance(titles, str) else titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))
    title_depths = list(zip(titles, depths))
    sentences = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    title, d = '', 0
    wiki = Wikipedia()
    for depth in range(max_depth):
        for i in range(max_articles):
            title = None
            while not title or title in titles_scraped:
                # log.warn(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.warning(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            if not (len(page.text) + len(page.summary)):
                log.error(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                for t in section.text.split('\n')[1:]:
                    if t in page.links:
                        title_depths.append((t, d + 1))
                log.debug(f'extended title_depths at depth {d}: {title_depths}')
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text = section.text.replace('’', "'")  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                # FIXME: need to rejoin short names before colons, like 'ELIZA:' 'Tell me...', and 'Human:' 'What...'
                # FIXME: need to split on question marks without white space but where next word is capitalized: ...to be unhappy?Though designed strictly...
                sentences.extend([
                    (d, title, section.title, s.text) for s in nlp(text).sents if (
                        len(s.text.strip().strip('"').strip("'").strip()) > 1)
                ])
            log.debug(f'Parsed {len(sentences)} sentences.')

            # retval = parse_sentences(
            #     title=title, sentences=sentences, title_depths=title_depths, see_also=see_also,
            #     exclude_headings=exclude_headings, d=d, depth=depth, max_depth=max_depth)
            # if retval is None:
            #     continue
            # else:
            #     sentences, title_depths = retval
            log.info(str([depth, d, i, title]))
            if d > depth:
                log.info(f"{d} > {depth}")
                break

    return pd.DataFrame(sentences, columns='depth title section sentence'.split())
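Example invocation (mirroring the doctest above); the result has one row per sentence with depth, title, section, and sentence columns:

df = scrape_articles(['ELIZA'], see_also=False)
print(df.head())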