Example #1
def __init__(self, lang="en"):
    super().__init__()
    self.package_check(lang)
    self.load_macros(lang)
    self.load_patterns(lang)
    # Register the 'arguments' extension once; assumes `from spacy.tokens import Doc`.
    if not Doc.has_extension('arguments'):
        Doc.set_extension('arguments', getter=ArgumentTexts(self))
    else:
        # Already registered: verify the getter is ours rather than another package's.
        default, method, getter, setter = Doc.get_extension('arguments')
        assert isinstance(getter, ArgumentTexts), \
            "Expected 'arguments' extension to be of type ArgumentTexts " \
            "but found {}. Namespace clash?".format(type(getter))
Example #2
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc

# normalize_words and add_titles_ent_pipe are assumed to be project-local helpers.
def load(path):
    """Load the data, spaCy model, and documents.

    Parameters
    ----------
    path : str
        Path to Steam review data CSV.

    Returns
    -------
    pd.DataFrame
        Review data.
    spacy.lang.en.English
        Language object with extra pipeline components.
    numpy.ndarray of spacy.tokens.Doc
        Documents passed through the NLP pipeline.
    """
    steam_rev = pd.read_csv(path, low_memory=False)
    normalize_words(steam_rev)
    # Steam reviews can be voted funny or given a thumbs up.
    # So...let's use these as our classes!
    steam_rev["up_funny"] = steam_rev.votes_up > steam_rev.votes_funny

    # Start with a pretrained model on blogs (maybe this is a bad idea?)
    nlp = spacy.load("en_core_web_md")

    # Store the title of each game and whether the review is positive/negative
    if not Doc.has_extension("is_recommended"):
        Doc.set_extension("is_recommended", default=np.nan)
    if not Doc.has_extension("title"):
        Doc.set_extension("title", default=np.nan)

    # Add extra pipeline components
    add_titles_ent_pipe(steam_rev, nlp)
    # Can't get this to work.
    # nlp.add_pipe("add_title",
    #             after="entity_ruler",
    #             config={"titles_iter": steam_rev.title.iteritems()})

    # Reviews are short so I'm using a large batch size.
    # dtype=object keeps the Doc objects intact instead of coercing them.
    return steam_rev, nlp, np.array(list(nlp.pipe(steam_rev.user_review,
                                                  cleanup=True,
                                                  batch_size=256,
                                                  n_process=-1)),
                                    dtype=object)
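
A minimal usage sketch, assuming a CSV at a hypothetical path with the columns referenced above (user_review, votes_up, votes_funny, title); the custom attributes stay at their np.nan defaults unless a later pipeline component fills them in.

# Hypothetical usage; "steam_reviews.csv" is an illustrative path, not from the original code.
reviews, nlp, docs = load("steam_reviews.csv")

print(reviews["up_funny"].value_counts())  # the two classes derived above

# Custom attributes live in the underscore namespace; they remain np.nan here
# unless a pipeline component (e.g. the commented-out "add_title") sets them.
for doc in docs[:5]:
    print(doc._.title, doc._.is_recommended, doc.text[:60])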