Example #1
    def __init__(self, text_or_sdoc, spacy_pipeline=None, lang=None, metadata=None):
        self.metadata = {} if metadata is None else metadata
        self._term_counts = Counter()

        if isinstance(text_or_sdoc, str):
            self.lang = text_utils.detect_language(text_or_sdoc) if not lang else lang
            if spacy_pipeline is None:
                spacy_pipeline = data.load_spacy(self.lang)
            # check for match between text and passed spacy_pipeline language
            else:
                if spacy_pipeline.lang != self.lang:
                    msg = 'TextDoc.lang {} != spacy_pipeline.lang {}'.format(
                        self.lang, spacy_pipeline.lang)
                    raise ValueError(msg)
            self.spacy_vocab = spacy_pipeline.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = spacy_pipeline(text_or_sdoc)

        elif isinstance(text_or_sdoc, sdoc):
            self.lang = spacy_pipeline.lang if spacy_pipeline is not None else \
                text_utils.detect_language(text_or_sdoc.text_with_ws)
            self.spacy_vocab = text_or_sdoc.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = text_or_sdoc

        else:
            msg = 'TextDoc must be initialized with {}, not {}'.format(
                {str, sdoc}, type(text_or_sdoc))
            raise ValueError(msg)
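A minimal usage sketch for the constructor above, assuming it belongs to the old textacy `TextDoc` class (the import path below is hypothetical) and that an English model is installed; `spacy.load('en')` follows the spaCy 1.x convention this code targets:

import spacy

from textacy.texts import TextDoc  # hypothetical import path for the class shown above

nlp = spacy.load('en')  # the pipeline language must match the text's language

# Parse raw text; the language is auto-detected when `lang` is not given.
doc = TextDoc('This is a short example.', spacy_pipeline=nlp)

# Reuse an already-parsed spacy.Doc instead of re-parsing the text.
parsed = nlp('Another short example.')
doc_from_sdoc = TextDoc(parsed, spacy_pipeline=nlp)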
Example #2
    def __init__(self,
                 text_or_sdoc,
                 spacy_pipeline=None,
                 lang=None,
                 metadata=None):
        self.metadata = {} if metadata is None else metadata
        self._term_counts = Counter()

        if isinstance(text_or_sdoc, string_types):
            self.lang = text_utils.detect_language(
                text_or_sdoc) if not lang else lang
            if spacy_pipeline is None:
                spacy_pipeline = data.load_spacy(self.lang)
            # check for match between text and passed spacy_pipeline language
            else:
                if spacy_pipeline.lang != self.lang:
                    msg = 'TextDoc.lang {} != spacy_pipeline.lang {}'.format(
                        self.lang, spacy_pipeline.lang)
                    raise ValueError(msg)
            self.spacy_vocab = spacy_pipeline.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = spacy_pipeline(text_or_sdoc)

        elif isinstance(text_or_sdoc, sdoc):
            self.lang = spacy_pipeline.lang if spacy_pipeline is not None else \
                text_utils.detect_language(text_or_sdoc.text_with_ws)
            self.spacy_vocab = text_or_sdoc.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = text_or_sdoc

        else:
            msg = 'TextDoc must be initialized with {}, not {}'.format(
                {str, sdoc}, type(text_or_sdoc))
            raise ValueError(msg)
Example #3
import csv

# `preprocess` and `detect_language` are assumed to be imported elsewhere in
# the original project (a project-local utils module and a language-detection
# helper, respectively).
def extract_responses(filepath, writer):
    with open(filepath) as input_file:
        reader = csv.reader(input_file, quoting=csv.QUOTE_MINIMAL)
        deleted = "deleted"
        for line in reader:
            if (deleted not in line[0]) and (deleted not in line[1]):
                preprocessed_line = preprocess(line[1])
                try:
                    if detect_language(preprocessed_line) == 'en':
                        writer.writerow([preprocessed_line])
                except ValueError:
                    continue
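A minimal calling sketch for the function above; the file names are placeholders, not paths from the original project:

import csv

with open('english_responses.csv', 'w', newline='') as output_file:
    writer = csv.writer(output_file, quoting=csv.QUOTE_MINIMAL)
    extract_responses('responses.csv', writer)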
Example #4
    def __init__(self, content, metadata=None, lang=None):
        self.metadata = metadata or {}

        # Doc instantiated from text, so must be parsed with a spacy.Language
        if isinstance(content, unicode_type):
            if isinstance(lang, SpacyLang):
                self.lang = lang.lang
                spacy_lang = lang
            elif isinstance(lang, unicode_type):
                self.lang = lang
                spacy_lang = data.load_spacy(self.lang)
            elif lang is None:
                self.lang = text_utils.detect_language(content)
                spacy_lang = data.load_spacy(self.lang)
            else:
                msg = '`lang` must be {}, not "{}"'.format(
                    {unicode_type, SpacyLang}, type(lang))
                raise ValueError(msg)
            self.spacy_vocab = spacy_lang.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = spacy_lang(content)
        # Doc instantiated from an already-parsed spacy.Doc
        elif isinstance(content, SpacyDoc):
            self.spacy_vocab = content.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = content
            self.lang = self.spacy_vocab.lang
            # these checks are probably unnecessary, but in case a user
            # has done something very strange, we should complain...
            if isinstance(lang, SpacyLang):
                if self.spacy_vocab is not lang.vocab:
                    msg = '`spacy.Vocab` used to parse `content` must be the same as the one associated with `lang`'
                    raise ValueError(msg)
            elif isinstance(lang, unicode_type):
                if lang != self.lang:
                    raise ValueError(
                        'lang of spacy models used to parse `content` must be the same as `lang`'
                    )
            elif lang is not None:
                msg = '`lang` must be {}, not "{}"'.format(
                    {unicode_type, SpacyLang}, type(lang))
                raise ValueError(msg)
        # oops, user has made some sort of mistake
        else:
            msg = '`Doc` must be initialized with {}, not "{}"'.format(
                {unicode_type, SpacyDoc}, type(content))
            raise ValueError(msg)
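The constructor accepts `lang` as a language-code string, a loaded spacy.Language, or None. A minimal sketch of the three paths, assuming a textacy release that exposes `Doc` at the package level and an installed English spaCy model:

import spacy
import textacy

text = 'The quick brown fox jumps over the lazy dog.'

# lang passed as a language-code string
doc1 = textacy.Doc(text, lang='en')

# lang passed as an already-loaded spacy.Language; its vocab is reused directly
nlp = spacy.load('en')
doc2 = textacy.Doc(text, lang=nlp)

# lang omitted: the constructor falls back to text_utils.detect_language(text)
doc3 = textacy.Doc(text)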
Example #5
    def __init__(self, content, metadata=None, lang=None):
        self.metadata = metadata or {}

        # Doc instantiated from text, so must be parsed with a spacy.Language
        if isinstance(content, unicode_type):
            if isinstance(lang, SpacyLang):
                self.lang = lang.lang
                spacy_lang = lang
            elif isinstance(lang, unicode_type):
                self.lang = lang
                spacy_lang = data.load_spacy(self.lang)
            elif lang is None:
                self.lang = text_utils.detect_language(content)
                spacy_lang = data.load_spacy(self.lang)
            else:
                msg = '`lang` must be {}, not "{}"'.format(
                    {unicode_type, SpacyLang}, type(lang))
                raise ValueError(msg)
            self.spacy_vocab = spacy_lang.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = spacy_lang(content)
        # Doc instantiated from an already-parsed spacy.Doc
        elif isinstance(content, SpacyDoc):
            self.spacy_vocab = content.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = content
            self.lang = self.spacy_vocab.lang
            # these checks are probably unnecessary, but in case a user
            # has done something very strange, we should complain...
            if isinstance(lang, SpacyLang):
                if self.spacy_vocab is not lang.vocab:
                    msg = '`spacy.Vocab` used to parse `content` must be the same as the one associated with `lang`'
                    raise ValueError(msg)
            elif isinstance(lang, unicode_type):
                if lang != self.lang:
                    raise ValueError('lang of spacy models used to parse `content` must be the same as `lang`')
            elif lang is not None:
                msg = '`lang` must be {}, not "{}"'.format(
                    {unicode_type, SpacyLang}, type(lang))
                raise ValueError(msg)
        # oops, user has made some sort of mistake
        else:
            msg = '`Doc` must be initialized with {}, not "{}"'.format(
                {unicode_type, SpacyDoc}, type(content))
            raise ValueError(msg)
Example #6
    def __init__(self, text, spacy_pipeline=None, lang='auto',
                 metadata=None, max_cachesize=5):
        self.metadata = {} if metadata is None else metadata
        self.lang = text_utils.detect_language(text) if lang == 'auto' else lang
        if spacy_pipeline is None:
            self.spacy_pipeline = data.load_spacy_pipeline(lang=self.lang)
        else:
            # check for match between text and supplied spacy pipeline language
            if spacy_pipeline.lang != self.lang:
                msg = 'TextDoc.lang {} != spacy_pipeline.lang {}'.format(
                    self.lang, spacy_pipeline.lang)
                raise ValueError(msg)
            else:
                self.spacy_pipeline = spacy_pipeline
        self.spacy_vocab = self.spacy_pipeline.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = self.spacy_pipeline(text)
        self._term_counts = Counter()
        self._cache = LRUCache(maxsize=max_cachesize)
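This variant also sets up an LRU cache. Assuming `LRUCache` comes from the cachetools package, it behaves as a small mapping that evicts the least-recently-used entry once `maxsize` is exceeded, as in this minimal sketch:

from cachetools import LRUCache

cache = LRUCache(maxsize=2)
cache['a'] = 1
cache['b'] = 2
cache['c'] = 3  # exceeds maxsize, so the least-recently-used key ('a') is evicted
assert 'a' not in cache and 'b' in cache and 'c' in cache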
Example #7
import csv

import textacy

from src.utils import preprocess

# `detect_language` is assumed to be imported elsewhere in the original script
# (e.g. from langdetect or textacy's text_utils).

if __name__ == '__main__':
    EMOTION_DATAPATH = 'data/processed/emotions_full.csv'
    FASTTEXT_FULL_FILE = 'data/processed/fasttext_full.txt'
    MODEL_PATH = 'models/emotion_classification/fasttext/model'
    label_prefix = '__label__'
    texts = []
    labels = []
    with open(EMOTION_DATAPATH) as data_file:
        reader = csv.reader(data_file, quoting=csv.QUOTE_MINIMAL)
        next(reader)  # skip the CSV header row
        for i, line in enumerate(reader):
            preprocessed_line = preprocess(line[1])
            if detect_language(preprocessed_line) == 'en':
                doc = textacy.Doc(preprocessed_line, lang='en_core_web_lg')
                texts.append(doc)
                labels.append(line[2])

    with open(FASTTEXT_FULL_FILE, 'w') as output_file:
        for x, y in zip(texts, labels):
            output_file.write(
                ' , '.join([label_prefix + str(y), x.text.replace('\n', '')]) + '\n')

    # Hypertuned by fasttext_hypertuning.py
    dim = 300
    lr = 0.1
    epoch = 10
    word_ngrams = 1
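The snippet ends right after the hyperparameters; a hedged sketch of what the training step could look like with the `fasttext` Python bindings follows (the actual call made by the original script is not shown, so this continuation is an assumption):

import fasttext

# Hypothetical continuation: train a supervised classifier on the file written
# above and save it under MODEL_PATH. Parameter names follow the fasttext
# Python package; `wordNgrams` takes the `word_ngrams` value set above.
model = fasttext.train_supervised(input=FASTTEXT_FULL_FILE,
                                  dim=dim,
                                  lr=lr,
                                  epoch=epoch,
                                  wordNgrams=word_ngrams,
                                  label=label_prefix)
model.save_model(MODEL_PATH)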
Example #8
    def test_detect_language(self):
        for lang, sent in LANG_SENTS:
            self.assertEqual(text_utils.detect_language(sent), lang)
Example #9
def test_detect_language():
    for lang, sent in LANG_SENTS:
        assert text_utils.detect_language(sent) == lang
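Both test variants iterate over a LANG_SENTS fixture of (language code, sentence) pairs. The fixture itself is not shown, so the following is only an illustrative stand-in, not the project's actual test data:

LANG_SENTS = [
    ('en', 'This is a sentence written in English.'),
    ('es', 'Esta es una frase escrita en español.'),
    ('fr', 'Ceci est une phrase écrite en français.'),
]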