Example #1
def index(client, freq_file, lang):
    tweets = client['twitter_'+lang]['tweets']
    # freq_dict = load(freq_file)
    freq_dict = defaultdict(int)

    i = 0
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1

    # for the second db (tr)
    tweets = client['new_'+lang]['tweets']
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1


    save(freq_file, freq_dict)
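The indexing helper above is essentially a segtok word-frequency count over MongoDB collections. As a minimal, self-contained sketch of that inner loop (no MongoDB; the original module's filter_pattern and save helpers are left out), the same counting pattern looks like this:

from collections import defaultdict

from segtok.segmenter import split_multi
from segtok.tokenizer import word_tokenizer

def count_words(texts):
    # texts: any iterable of unicode strings (e.g. tweet bodies)
    freq_dict = defaultdict(int)
    for text in texts:
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1
    return freq_dict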
Example #2
def char_split_if_io_example(sentence):
    """Split text into characters.

    Used for input/output examples for which char-level info is relevant.
    TODO: add a section for converting everything to lower case.
    """
    i = 'Input ¶'
    o = 'Output ¶'
    '''
	i = 'Input \xb6'
	o = 'Output \xb6'
	'''

    sentence_encoded = sentence
    sentence = sentence.decode('utf-8')

    if i in sentence_encoded:
        sentence = sentence_encoded.split(i)
        sentence = word_tokenizer(i.decode('utf-8')) + list(sentence[1])
        s = sentence
        for jdx, j in enumerate(sentence):
            if j == '\xc2':
                s[jdx:jdx + 2] = [u'\xb6']
        sentence = s
    elif o in sentence_encoded:
        sentence = sentence_encoded.split(o)
        sentence = word_tokenizer(o.decode('utf-8')) + list(sentence[1])
        s = sentence
        for jdx, j in enumerate(sentence):
            if j == '\xc2':
                s[jdx:jdx + 2] = [u'\xb6']
        sentence = s
    else:
        sentence = word_tokenizer(sentence)
    return sentence
Example #3
def validate_keywords_contexts_offsets(source,
                                       index_type='es',
                                       extract_contexts=False):
    """ Validate an updated document """
    assert 'keyterms' in source and 'offsets' in source
    # Keyterms are stored as a json string
    keyterms = source['keyterms']
    offsets = source['offsets']
    if keyterms is None:
        # When keyterms is None it is because the extracted keyterms could not meet the filter criteria
        # (i.e. min_bg_count). This should only happen when min_bg_count > 1
        return
    if index_type == 'es':
        field_text = source[FIELD_NAME]
    else:
        # Mongo doesn't have the raw text field; get from ES
        doc_id = source['_id']
        es_source = get_es_source(doc_id)
        field_text = es_source[FIELD_NAME]

    # Because of ambiguity with punctuation when extracting keyterms, we
    # include tokens split by punct and removed punct
    field_tokens_set = set(tokenizer.word_tokenizer(field_text))
    field_tokens_set = expand_tokens_set_to_split_by_punct(field_tokens_set)
    # To account for how ES handles punctuation, we use both tokenized and non-tokenized forms of each token
    for k in keyterms:
        try:
            assert k in field_tokens_set
        except AssertionError:
            # Try the keyterm without punct
            assert k.translate(str.maketrans(
                '', '', string.punctuation)) in field_tokens_set

    # Check that the keyterm offsets correspond with the raw text field
    for keyterm, offset in zip(keyterms, offsets):
        for start_idx, end_idx in offset:
            assert field_text[start_idx:end_idx] == keyterm

    if extract_contexts:
        assert 'contexts' in source
        contexts = source['contexts']
        # Check that the contexts cover all the identified keyterms
        context_tokens_set = set()
        for ctx in contexts:
            context_tokens_set = context_tokens_set.union(
                tokenizer.word_tokenizer(ctx))
            context_tokens_set = expand_tokens_set_to_split_by_punct(
                context_tokens_set)
        for keyterm in keyterms:
            try:
                assert keyterm in context_tokens_set
            except AssertionError:
                # Try the keyterm without punct
                assert keyterm.translate(
                    str.maketrans('', '',
                                  string.punctuation)) in context_tokens_set
Example #4
    def __next__(self):
        if self._curr_row is None:
            raise StopIteration()

        row = self._curr_row

        if len(row) != self._row_len:
            msg = 'found %d columns, but expected %d at line %s:\n%s'
            raise IOError(msg % (
                len(row), self._row_len, self._line, str(row)
            ))

        try:
            self._curr_row = next(self._row_gen)
            self._line += 1
        except StopIteration:
            self._curr_row = None

        data_or_text = lambda c: row[c] if c not in self.text_columns else []
        data = [data_or_text(col) for col in range(self._row_len)]

        for col in self.text_columns:
            for sentence in split_single(row[col]):
                sentence = self._decap(sentence)
                tokens = [self._lower(t) for t in word_tokenizer(sentence)]
                data[col].append(tokens)

        return data
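For orientation: each yielded row keeps non-text columns as raw values, while every column in text_columns becomes a list of sentences, each a list of tokens. A rough illustration, assuming _decap and _lower are configured as no-ops:

# row == ['42', 'Hello, world!'] with text_columns == {1} yields roughly
# ['42', [['Hello', ',', 'world', '!']]]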
Example #5
    def __next__(self):
        if self._curr_row is None:
            raise StopIteration()

        row = self._curr_row

        if len(row) != self._row_len:
            msg = 'found %d columns, but expected %d at line %s:\n%s'
            raise IOError(msg %
                          (len(row), self._row_len, self._line, str(row)))

        try:
            self._curr_row = next(self._row_gen)
            self._line += 1
        except StopIteration:
            self._curr_row = None

        data_or_text = lambda c: row[c] if c not in self.text_columns else []
        data = [data_or_text(col) for col in range(self._row_len)]

        for col in self.text_columns:
            for sentence in split_single(row[col]):
                sentence = self._decap(sentence)
                tokens = [self._lower(t) for t in word_tokenizer(sentence)]
                data[col].append(tokens)

        return data
Example #6
    def __init__(self,
                 text: str = None,
                 use_tokenizer: bool = False,
                 labels: List[str] = None):

        self.tokens: List[Token] = []

        self.labels: List[str] = labels

        self._embeddings: Dict = {}

        # optionally, directly instantiate with sentence tokens
        if text is not None:

            # tokenize the text first if option selected, otherwise assumes whitespace tokenized text
            if use_tokenizer:
                sentences = split_single(text)
                tokens = []
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                text = ' '.join(tokens)

            # add each word in tokenized string as Token object to Sentence
            for word in text.split(' '):
                self.add_token(Token(word))
Example #7
def tokenize(text,
             segment=True,
             norm=True,
             unique=False,
             min_len=2,
             max_sent=0):
    '''
    Tokenize text using SegTok segmenter and tokenizer.
    '''
    sentences = split_multi(text) if segment else [text]

    tokens = []

    for i, s in enumerate(sentences):
        if max_sent and i >= max_sent:
            break
        tokens += word_tokenizer(s)

    if unique:
        tokens = list(set(tokens))

    if min_len:
        tokens = [t for t in tokens if len(t) >= min_len]

    if norm:
        tokens = [w for t in tokens for w in normalize(t).split()]

    return tokens
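A hedged usage sketch for the tokenize() helper above. It assumes segtok is installed; because the normalize helper is not shown in the snippet, norm=False is passed so only the visible code paths are exercised:

sample = "Dr. Smith arrived in Berlin. He gave two talks."
# min_len=2 drops single-character tokens (stray punctuation, for instance)
tokens = tokenize(sample, segment=True, norm=False, unique=False, min_len=2)
print(tokens)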
Example #8
    def tokenize(self, review, should_stem = True):
        cleaned_review = clean_sentence(review.lower())

        if should_stem:
            cleaned_review = " ".join([self.stemmer.stem(word) for word in cleaned_review.split()])

        tokenized_review = tokenizer.word_tokenizer(cleaned_review)

        return tokenized_review
Example #9
    def run_tokenize(text: str) -> List[str]:
        words: List[str] = []

        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            words.extend(contractions)

        words = list(filter(None, words))

        return words
Example #10
def tokenize(text):
    """
    Inputs: text
    Outputs: tokens tokenized by segtok.tokenizer
    """
    tokens = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        tokens.extend(contractions)
    return tokens
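A small, hedged demo of what the split_single + split_contractions + word_tokenizer pipeline used here (and in several snippets below) produces when fed to the tokenize function above; the exact token list depends on the segtok version:

print(tokenize("It's a test. Don't worry."))
# roughly: ['It', "'s", 'a', 'test', '.', 'Do', "n't", 'worry', '.']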
Example #11
 def tokenize(self, text):
     """
     tokenize the text and filter out @usernames (and punctuation, smileys, ...), leaving only words
     """
     words = [] # list of words
     # text = text.decode('utf-8')
     text = filter_pattern.sub(' ', text)
     for sent in split_multi(text):
         for token in word_tokenizer(sent):
             words.append(token.encode('utf-8', 'ignore'))
     return words
Example #12
    def __init__(self,
                 text: str = None,
                 use_tokenizer: bool = False,
                 labels: Union[List[Label], List[str]] = None):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None: self.add_labels(labels)

        self._embeddings: Dict = {}

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer:

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    token = Token(word)
                    self.add_token(token)
                    try:
                        word_offset = index(word, running_offset)
                    except:
                        word_offset = last_word_offset + 1
                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False
                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            else:
                # add each word in tokenized string as Token object to Sentence
                for word in text.split(' '):
                    if word:
                        token = Token(word)
                        self.add_token(token)
Example #13
def tokenize_old(output_file, db = 'crawler'):
    client = MongoClient()
    texts = client[db]['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find():
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
            f.write('\n')
    f.close()
Example #14
 def __init__(self, text=None, use_tokenizer=False, labels=None):
     super(Sentence, self).__init__()
     self.tokens = []
     self.labels = []
     if (labels is not None):
         self.add_labels(labels)
     self._embeddings = {}
     if (text is not None):
         if use_tokenizer:
             tokens = []
             sentences = split_single(text)
             for sentence in sentences:
                 contractions = split_contractions(word_tokenizer(sentence))
                 tokens.extend(contractions)
             index = text.index
             running_offset = 0
             last_word_offset = (-1)
             last_token = None
             for word in tokens:
                 try:
                     word_offset = index(word, running_offset)
                     start_position = word_offset
                 except:
                     word_offset = (last_word_offset + 1)
                     start_position = ((running_offset + 1) if
                                       (running_offset > 0) else
                                       running_offset)
                 token = Token(word, start_position=start_position)
                 self.add_token(token)
                 if (((word_offset - 1) == last_word_offset)
                         and (last_token is not None)):
                     last_token.whitespace_after = False
                 word_len = len(word)
                 running_offset = (word_offset + word_len)
                 last_word_offset = (running_offset - 1)
                 last_token = token
         else:
             word = u''
             for (index, char) in enumerate(text):
                 if (char == u' '):
                     if (len(word) > 0):
                         token = Token(word,
                                       start_position=(index - len(word)))
                         self.add_token(token)
                     word = u''
                 else:
                     word += char
             index += 1
             if (len(word) > 0):
                 token = Token(word, start_position=(index - len(word)))
                 self.add_token(token)
Example #15
 def parse_words(self):
     stemmer = SnowballStemmer("english")
     with jsonlines.open(self.src) as reader:
         for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
             # review = tokenizer.word_tokenizer(obj["text"].lower())
             # curr_words += review
             review = obj["text"]
             cleaned_review = clean_sentence(review)
             
             # stemmed_review = " ".join([stemmer.stem(word) for word in cleaned_review.split()]) UNCOMMENT THIS LINE FOR STEMMING
             tokenized_review = tokenizer.word_tokenizer(cleaned_review.lower())
             self.words.update(tokenized_review)
             
     print(len(self.words), "unique total words")
Example #16
def read(origin_file, freq_file, lang):
    freq_dict = defaultdict(int)
    i = 0
    for line in open(origin_file):
        i += 1
        if i % 100000 == 0:
            print i
        items = line.strip().split(',', 3)
        if len(items) == 4 and items[0] == lang:
            # text = items[3].lower().decode('utf-8')
            text = items[3].decode('utf-8')
            text = re.sub(filter_pattern, '', text)
            for sent in split_multi(text):
                for word in word_tokenizer(sent):
                    freq_dict[word] += 1
    save(freq_file, freq_dict)
Example #17
def segtok_tokenizer(text: str) -> List[Token]:
    """
    Tokenizer using segtok, a third-party rule-based segmenter and tokenizer for Indo-European languages.
    https://github.com/fnl/segtok
    """
    tokens: List[Token] = []

    words: List[str] = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)

    words = list(filter(None, words))

    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        #try:
        word_offset = index(word, current_offset)
        start_position = word_offset
        #except:
        #    word_offset = previous_word_offset + 1
        #    start_position = (
        #        current_offset + 1 if current_offset > 0 else current_offset
        #    )

        if word:
            token = Token(text=word,
                          start_position=start_position,
                          whitespace_after=True)
            tokens.append(token)

        if (previous_token
                is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token

    return tokens
Example #18
def tokenize_on_date(output_file, date = '2015-07-05'):
    client = MongoClient()
    texts = client['crawler']['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find({'date': date}):
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                if re.search("'s$", token):
                    f.write('%s\t%s\n' % (token[:-2].encode('utf-8', 'ignore'), 'X'))
                    f.write('%s\t%s\n' % (token[-2:].encode('utf-8', 'ignore'), 'X'))
                else:
                    f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
                    
            f.write('\n')
    f.close()
Example #19
 def _n_grams(self):
     """Make n_grams dict from list of messages."""
     main_dict = defaultdict(dict)
     for record in self.messages_list:
         tokenized = word_tokenizer(record)
         tokenized.insert(0, "<start_token>")
         tokenized.append("<end_token>")
         # TODO: find a better way to insert start/end tokens
         n_gramed = ngrams(tokenized, self.n_gram)
         for n_gram in n_gramed:
             # instead of counter, so we iterate over only once
             # but with if statement
             # TODO: estimate speed of two approaches
             if main_dict.get(n_gram[:-1], {}).get(n_gram[-1]):
                 main_dict[n_gram[:-1]][n_gram[-1]] += 1
             else:
                 main_dict[n_gram[:-1]][n_gram[-1]] = 1
     self.lm_dict = main_dict
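For concreteness, the shape lm_dict takes for a tiny input, assuming ngrams is the usual nltk-style n-gram generator and self.n_gram == 2:

# messages_list == ["hi there"] gives tokenized == ["<start_token>", "hi", "there", "<end_token>"]
# and, after counting bigrams, lm_dict ==
# {("<start_token>",): {"hi": 1}, ("hi",): {"there": 1}, ("there",): {"<end_token>": 1}}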
Example #20
def preprocess_capitalization(text):
    words = tokenizer.word_tokenizer(text)
    final_words = []
    for word in words:
        if not word.isalpha():
            final_words.append(word.lower())
        else:
            if word.islower():
                pass
            elif word.isupper():
                final_words.append("⇧")
            elif word[0].isupper() and word[1:].islower():
                final_words.append("↑")
            else:
                final_words.append("↑")

            final_words.append(word.lower())
    return " ".join(final_words)
Example #21
 def sentences_iterator(self, log_every=10000):
     # Do a full pass over the data set
     c = 0
     for batch in self.es_utility.scroll_indexed_data(
             self.es_field_name, self.must_have_fields,
             self.must_not_have_fields, self.use_analyzed_field):
         for d in batch:
             if self.use_analyzed_field:
                 tokens = self.extract_tokens_from_termvectors(
                     d, self.es_field_name)
             else:
                 source = d['_source']
                 text = source[self.es_field_name]
                 tokens = tokenizer.word_tokenizer(text)
             yield tokens
             c += 1
             if c % log_every == 0:
                 print("Processed {} documents".format(c))
Example #22
    def tokenize(self, tweets):
        """
        tokenize the text and filter out @usernames (and punctuation, smileys, ...), leaving only words
        """

        counts = [] # [5, 12, 0, 3, ...] the counts of valid words for each tweet
        words = [] # list of words
        # out = '' # one-word-per-line string of the tokenized words for morph analysis
        
        for (text, tid, uid) in tweets:
            i = 0
            text = filter_pattern.sub(' ', text)
            for sent in split_multi(text):
                for token in word_tokenizer(sent):
                    # words.append(token.lower().encode('utf-8', 'ignore'))
                    words.append(token.encode('utf-8', 'ignore'))
                    i += 1
            counts.append(i)
        return words, counts
Example #23
def _html_tokenize(sentence):
    """Tokenize string into words, not splitting URIS or emails, wrapping segtok:word_tokenizer.

    It does not split URIs or e-mail addresses. It does not treat html escapes
    as single characters outside of these instances.(eg. &amp; -> '&', 'amp', ';')

    Args:
        sentence: input string for tokenization

    Returns:
        tokens: list of str
    """
    tokens = []
    for i, span in enumerate(web_tokenizer.split(sentence)):
        if i % 2:
            tokens.append(span)
        else:
            tokens.extend(word_tokenizer(span))
    return tokens
Example #24
    def tokenize(self, tweets):
        """
        tokenize the text and filter out @usernames (and punctuation, smileys, ...), leaving only words
        """

        counts = [
        ]  # [5, 12, 0, 3, ...] the counts of valid words for each tweet
        words = []  # list of words
        # out = '' # one-word-per-line string of the tokenized words for morph analysis

        for (text, tid, uid) in tweets:
            i = 0
            text = filter_pattern.sub(' ', text)
            for sent in split_multi(text):
                for token in word_tokenizer(sent):
                    # words.append(token.lower().encode('utf-8', 'ignore'))
                    words.append(token.encode('utf-8', 'ignore'))
                    i += 1
            counts.append(i)
        return words, counts
Example #25
    def run_tokenize(text: str) -> List[Token]:
        tokens: List[Token] = []
        words: List[str] = []

        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            words.extend(contractions)

        words = list(filter(None, words))

        # determine offsets for whitespace_after field
        index = text.index
        current_offset = 0
        previous_word_offset = -1
        previous_token = None
        for word in words:
            try:
                word_offset = index(word, current_offset)
                start_position = word_offset
            except:
                word_offset = previous_word_offset + 1
                start_position = (current_offset +
                                  1 if current_offset > 0 else current_offset)

            if word:
                token = Token(text=word,
                              start_position=start_position,
                              whitespace_after=True)
                tokens.append(token)

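            # a start offset immediately after the previous token's end means
            # no whitespace separated the two tokens in the raw text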
            if (previous_token
                    is not None) and word_offset - 1 == previous_word_offset:
                previous_token.whitespace_after = False

            current_offset = word_offset + len(word)
            previous_word_offset = current_offset - 1
            previous_token = token

        return tokens
Example #26
def textrank(text, hdr):
    # finding out the most possible language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]

    # tokenizing for words
    sentences = [sentence for sentence in split_multi(text)]

    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))

    words = [set(stemmer.stemWord(word) for word in word_tokenizer(sentence.lower()) if word.isalpha())
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code
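The snippet relies on a similarity(words_i, words_j) helper that is not shown. A common TextRank-style choice, offered here only as a sketch and not necessarily the author's definition, is normalized word-set overlap:

import math

def similarity(words_i, words_j):
    # words_i / words_j are the per-sentence sets of stemmed words built above
    overlap = len(words_i & words_j)
    if overlap == 0:
        return 0.0
    # +1 keeps the log terms positive for one-word sentences
    denom = math.log(len(words_i) + 1) + math.log(len(words_j) + 1)
    return overlap / denom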
Example #27
    def __init__(self,
                 text: str = None,
                 use_tokenizer: str = 'split',
                 labels: Union[List[Label], List[str]] = None):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None: self.add_labels(labels)

        self._embeddings: Dict = {}

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer == 'segtok':

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    try:
                        word_offset = index(word, running_offset)
                        start_position = word_offset
                    except:
                        word_offset = last_word_offset + 1
                        start_position = running_offset + 1 if running_offset > 0 else running_offset

                    token = Token(word, start_position=start_position)
                    self.add_token(token)

                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False

                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            elif use_tokenizer == 'split':
                # add each word in tokenized string as Token object to Sentence
                offset = 0
                for word in text.split(' '):
                    if word:
                        try:
                            word_offset = text.index(word, offset)
                        except:
                            word_offset = offset

                        token = Token(word, start_position=word_offset)
                        self.add_token(token)
                        offset += len(word) + 1
            elif use_tokenizer == 'toki':
                cmd = ['toki-app', '-q', '-n', '-c', 'nkjp']
                p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE)
                stdout = p.communicate(input=text.encode('utf-8'))[0]
                offset = 0
                print(stdout.decode('utf-8').split('\n'))
                for t in stdout.decode('utf-8').split(
                        '\n')[:-2]:  #omit last two newlines
                    print('XX', t)
                    m = re.match(r'^(.*)/[tp]:(none|space|newline)', t)
                    word = m.group(1)
                    # before=m.group(2)
                    # print(word, text)
                    word_offset = text.index(word, offset)

                    token = Token(word, start_position=word_offset)
                    self.add_token(token)
                    offset = word_offset + len(word)
Example #28
def getScore():
    if request.method == 'POST':
        data_received = request.form['mydata'].lower()
        is_url = validators.url(data_received)

        headline = ""
        if not is_url:
            # if we didn't get a url we got a headline as our data
            headline = data_received
        else:
            # we need to go fetch the headline from the url
            source = requests.get(data_received)
            soup = bs.BeautifulSoup(source.content, features='html.parser')
            headline = " "
            found_headline = False
            h1_tags = soup.find_all('h1')
            for h1_tag in h1_tags:
                potential_text = h1_tag.find(text=True, recursive=True)
                if len(potential_text) > 1:
                    headline = potential_text
                    found_headline = True

            if not found_headline:
                resp = make_response()
                resp.status = 400
                return resp

        with tf.Session(graph=default_graph) as sess:
            model.saver.restore(sess, model_file)

            tokenized = tokenizer.word_tokenizer(headline)
            numerized = numerize_sequence(tokenized)

            padded, mask = pad_sequence(numerized, padI, input_length)

            hl_element = {}
            hl_element['tokenized'] = tokenized
            hl_element['numerized'] = padded
            hl_element['mask'] = mask
            d_hl = [hl_element]
            hl_input, hl_target, hl_target_mask = build_batch(d_hl, 1)
            feed = {
                model.input_num: hl_input,
                model.targets: hl_target,
                model.targets_mask: hl_target_mask
            }
            loss = sess.run([model.loss], feed_dict=feed)[0]

            analysis = ''

            if loss < 7.00:
                analysis = "The headline is not unusual (not impactful). The article may not have any affect on cryptocurrency prices."
            elif loss < 15.00:
                analysis = "The headline is unusual (potentially impactful). The article may have an impact on cryptocurrency prices. We recommend reading the article!"
            else:
                analysis = "The headline is highly unusual (either potentially very impactful or not related to cryptocurrency). If the article is related to cryptocurrency, we recommend reading the article in detail!"

            resp = make_response('{"loss": ' + str(loss) + ', "headline": "' +
                                 headline + '"' + ', "analysis": "' +
                                 analysis + '"' + '}')
            resp.headers['Content-Type'] = "application/json"

            return resp
Example #29
    def __init__(self, text: str = None, use_tokenizer: bool = False, labels: Union[List[Label], List[str]] = None):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None: self.add_labels(labels)

        self._embeddings: Dict = {}

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer:

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    try:
                        word_offset = index(word, running_offset)
                        start_position = word_offset
                    except:
                        word_offset = last_word_offset + 1
                        start_position = running_offset + 1 if running_offset > 0 else running_offset

                    token = Token(word, start_position=start_position)
                    self.add_token(token)

                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False

                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            else:
                # catch the empty string case
                if not text:
                    raise ValueError("Cannot convert empty string to a Sentence object.")
                # add each word in tokenized string as Token object to Sentence
                word = ''
                for index, char in enumerate(text):
                    if char == ' ':
                        if len(word) > 0:
                            token = Token(word, start_position=index-len(word))
                            self.add_token(token)

                        word = ''
                    else:
                        word += char
                # increment for last token in sentence if not followed by whitespace
                index += 1
                if len(word) > 0:
                    token = Token(word, start_position=index-len(word))
                    self.add_token(token)
Example #30
    def get_documents(self, path, doc_length):
        '''
        Create document list from input documents.
        '''
        print('Processing documents ...')
        docs = []
        for filename in [
                f for f in os.listdir(path) if f[-4:] in ['.txt', '.xml']
        ]:
            with open(path + '/' + filename) as f:
                print('Processing file: ' + filename)

                # Remove xml tags and decode
                if filename.endswith('.xml'):
                    xml = etree.fromstring(f.read())
                    text = etree.tostring(xml, encoding='utf-8', method='text')
                    doc = text.decode('utf-8')
                else:
                    doc = self.decode(f.read())

                # Process user provided regular expressions
                for regex in self.regex_list:
                    doc = re.sub(regex[0], regex[1], doc, flags=re.I)

                # Remove unwanted characters and whitespace
                unwanted_chars = [
                    '&', '/', '|', '_', ':', '=', '(', ')', '[', ']'
                ]
                for char in unwanted_chars:
                    doc = doc.replace(char, '')
                doc = ' '.join(doc.split())

                # Sentence chunk with Segtok
                sentences = [s for s in segmenter.split_single(doc)]

                # Split large documents into smaller parts
                if doc_length > 0:
                    sub_docs = [
                        sentences[i:i + doc_length]
                        for i in xrange(0, len(sentences), doc_length)
                    ]
                else:
                    sub_docs = [sentences]

                # Tokenize with Segtok or Frog
                for sub_doc in sub_docs:
                    tokens = []
                    if self.pos_tag:
                        tokens += self.frogger(sub_doc, filename)
                    else:
                        for sentence in sub_doc:
                            tokens += [
                                t.lower()
                                for t in tokenizer.word_tokenizer(sentence)
                            ]
                    if len(tokens):
                        docs.append(tokens)

        for filename in [f for f in os.listdir(path) if f.endswith('.json')]:
            with open(path + '/' + filename) as f:
                print('Processing file: ' + filename)
                docs += json.load(f)['docs']

        print('Number of (sub)documents: ' + str(len(docs)))
        assert docs, 'No documents found'

        return docs
Example #31
 def segment(self, sentence):
     return word_tokenizer(sentence)
Example #32
def segtok_tokenize(text):
    from segtok.tokenizer import word_tokenizer

    chunks = word_tokenizer(text)
    return find_substrings(chunks, text)
Example #33
    def __init__(
        self,
        text: str = None,
        use_tokenizer: bool = False,
        labels: Union[List[Label], List[str]] = None,
        language_code: str = None,
    ):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None:
            self.add_labels(labels)

        self._embeddings: Dict = {}

        self.language_code: str = language_code

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer:

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    try:
                        word_offset = index(word, running_offset)
                        start_position = word_offset
                    except:
                        word_offset = last_word_offset + 1
                        start_position = (running_offset +
                                          1 if running_offset > 0 else
                                          running_offset)

                    token = Token(word, start_position=start_position)
                    self.add_token(token)

                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False

                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            else:
                # add each word in tokenized string as Token object to Sentence
                word = ""
                index = -1
                for index, char in enumerate(text):
                    if char == " ":
                        if len(word) > 0:
                            token = Token(word,
                                          start_position=index - len(word))
                            self.add_token(token)

                        word = ""
                    else:
                        word += char
                # increment for last token in sentence if not followed by whitespace
                index += 1
                if len(word) > 0:
                    token = Token(word, start_position=index - len(word))
                    self.add_token(token)

        # log a warning if the dataset is empty
        if text == "":
            log.warn(
                "ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?"
            )
Example #34
 def __init__(self,
              text: str = None,
              use_tokenizer: bool = False,
              labels: Union[(List[Label], List[str])] = None,
              language_code: str = None):
     super(Sentence, self).__init__()
     self.tokens = []
     self.labels = []
     if (labels is not None):
         self.add_labels(labels)
     self._embeddings = {}
     self.language_code = language_code
     if (text is not None):
         if use_tokenizer:
             tokens = []
             sentences = split_single(text)
             for sentence in sentences:
                 contractions = split_contractions(word_tokenizer(sentence))
                 tokens.extend(contractions)
             index = text.index
             running_offset = 0
             last_word_offset = (-1)
             last_token = None
             for word in tokens:
                 try:
                     word_offset = index(word, running_offset)
                     start_position = word_offset
                 except:
                     word_offset = (last_word_offset + 1)
                     start_position = ((running_offset + 1) if
                                       (running_offset > 0) else
                                       running_offset)
                 token = Token(word, start_position=start_position)
                 self.add_token(token)
                 if (((word_offset - 1) == last_word_offset)
                         and (last_token is not None)):
                     last_token.whitespace_after = False
                 word_len = len(word)
                 running_offset = (word_offset + word_len)
                 last_word_offset = (running_offset - 1)
                 last_token = token
         else:
             word = ''
             index = (-1)
             for (index, char) in enumerate(text):
                 if (char == ' '):
                     if (len(word) > 0):
                         token = Token(word,
                                       start_position=(index - len(word)))
                         self.add_token(token)
                     word = ''
                 else:
                     word += char
             index += 1
             if (len(word) > 0):
                 token = Token(word, start_position=(index - len(word)))
                 self.add_token(token)
     if (text == ''):
         log.warn(
             'ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?'
         )
     self.tokenized = None
Example #35
def _get_aligned_tokens_core(original_text, original_aligned, noisy_aligned,
                             edit_ops):

    #TODO: empty noisy text!!
    status = True

    from segtok.tokenizer import word_tokenizer  #, split_contractions
    tokenized_original_text = _split_contractions(
        word_tokenizer(original_text))

    aligned_tokens = list()

    idx = 0
    for clean_token_text in tokenized_original_text:

        noisy_token_text = ""

        token_idx = 0

        # loop until the first char of the token matches the character of
        # the aligned original text. It will skip spurious characters that could
        # arise from insertion errors between tokens.
        while token_idx < len(clean_token_text) and idx < len(edit_ops):

            char_token = clean_token_text[token_idx]
            char_orig = original_aligned[idx]

            if char_token == char_orig:
                break

            idx += 1

        while token_idx < len(clean_token_text) and idx < len(edit_ops):

            op = edit_ops[idx]

            char_token = clean_token_text[token_idx]
            char_orig = original_aligned[idx]

            if op == "-":
                noisy_token_text += noisy_aligned[idx]
                token_idx += 1
                check = True
            elif op == "s":
                noisy_token_text += noisy_aligned[idx]
                token_idx += 1
                check = True
            elif op == "i":
                noisy_token_text += noisy_aligned[
                    idx]  # insert char and do not move to the next one
                check = False
            elif op == "d":
                token_idx += 1  # skip char and move to the next one
                check = True

            if check and char_orig != char_token:
                print(f"WRONG!!! idx={idx} {char_orig} != {char_token}")
                print(f"noisy_token_text={noisy_token_text}")
                status = False

            idx += 1

        # the next char is a whitespace (if we are not at the end of a sentence)
        # check whether it is substituted with another character, which will be
        # included into the noisy token text

        if idx < len(edit_ops) and edit_ops[idx] == "s" and original_aligned[
                idx].isspace():
            noisy_token_text += noisy_aligned[idx]
            idx += 1

        # alternatively, there could be one or more insertions at the end
        # include them into the noisy token text
        while idx < len(edit_ops) and edit_ops[idx] == "i":
            noisy_token_text += noisy_aligned[idx]
            idx += 1

        if idx < len(edit_ops) and edit_ops[idx] in [
                "-", "d"
        ] and original_aligned[idx].isspace():
            idx += 1

        aligned_tokens.append((clean_token_text, noisy_token_text))

        # if clean_token_text != noisy_token_text:
        #     log.info(f"*{clean_token_text}* -> *{noisy_token_text}*")

    if not status:
        print(f"{original_text}")
        print(f"{original_aligned}")
        print(f"{noisy_aligned}")
        print(f"{edit_ops}")
        print(f"{tokenized_original_text}")
        print(f"{aligned_tokens}")
        exit(-1)

    return aligned_tokens, status
Example #36
def extract_noisy_corpus(input_path,
                         log,
                         max_lines=-1,
                         split_num_lines=int(3e6)):

    from segtok.tokenizer import word_tokenizer
    from pysia.align import _split_contractions

    fname, fext = os.path.splitext(input_path)
    max_lines_str = get_max_lines_alias(max_lines)
    org_dir = f"{fname}_org_{max_lines_str}"
    rec_dir = f"{fname}_rec_{max_lines_str}"

    log.info(f"Starting corpus extraction:")
    log.info(f"Input data directory: {input_path}")
    log.info(f"Original data directory: {org_dir}")
    log.info(f"Noisy data directory: {rec_dir}")

    recreate_directory(org_dir)
    recreate_directory(rec_dir)

    org_file_idx, rec_file_idx = 0, 0
    org_line_idx, rec_line_idx = 0, 0
    #org_line_limit, rec_line_limit = split_num_lines / 10, split_num_lines / 10 # first split for validation
    org_line_limit, rec_line_limit = split_num_lines, split_num_lines

    org_file_path = os.path.join(org_dir, f"{org_file_idx:04d}_org.txt")
    rec_file_path = os.path.join(rec_dir, f"{org_file_idx:04d}_rec.txt")

    log.info(f"opening '{org_file_path}' for writing..")
    org_file = open(org_file_path, "w")

    log.info(f"opening '{rec_file_path}' for writing..")
    rec_file = open(rec_file_path, "w")

    num_org_lines, num_rec_lines = 0, 0

    with open(input_path, "r") as input_file:

        line = input_file.readline()
        line_idx = 0

        while line:

            tokens = _split_contractions(word_tokenizer(line.strip()))
            line = ' '.join([tok.strip() for tok in tokens])

            if line_idx % 3 == 0:  # header line
                elems = line.split(';')
                if len(elems) != 3:
                    log.error(f"Line: '{line}' length != 3'")
                    exit(-1)

            elif line_idx % 3 == 1:  # original text
                print(line.strip(), file=org_file)
                org_line_idx += 1
                num_org_lines += 1
                if org_line_idx >= org_line_limit:
                    org_file.close()
                    org_file_idx += 1
                    org_file_path = os.path.join(
                        org_dir, f"{org_file_idx:04d}_org.txt")
                    log.info(f"opening '{org_file_path}' for writing..")
                    org_file = open(org_file_path, "w")
                    org_line_idx = 0
                    org_line_limit = split_num_lines

            elif line_idx % 3 == 2:  # recognized text
                print(line.strip(), file=rec_file)
                rec_line_idx += 1
                num_rec_lines += 1
                if rec_line_idx >= rec_line_limit:
                    rec_file.close()
                    rec_file_idx += 1
                    rec_file_path = os.path.join(
                        rec_dir, f"{rec_file_idx:04d}_rec.txt")
                    log.info(f"opening '{rec_file_path}' for writing..")
                    rec_file = open(rec_file_path, "w")
                    rec_line_idx = 0
                    rec_line_limit = split_num_lines

            if max_lines > 0:
                num_lines = min(num_org_lines, num_rec_lines)
                if num_lines >= max_lines:
                    break

            line_idx += 1
            line = input_file.readline()

    org_file.close()
    rec_file.close()

    log.info(f"Loaded {line_idx} lines.")