Code example #1
def computeSentenceSimilarityFeatures(sentence1, sentence2):
    features = [0] * 8
    tokenizer = RegexpTokenizer(r'\w+')
    words1 = tokenizer.tokenize(sentence1)
    words2 = tokenizer.tokenize(sentence2)
    n = len(words1)
    m = len(words2)

    # word overlap features
    count = 0  # num of same words in sentence
    for word1 in words1:
        for word2 in words2:
            if word1 == word2:
                count += 1

    # TODO: Make it symmetric (improvement?)
    features[0] = count / n  # "precision"
    features[1] = count / m  # "recall"

    features[2] = sentence_bleu([sentence1], sentence2)
    features[3] = sentence_bleu([sentence2], sentence1)

    # Obtain pairs of adjacent words
    skipgrams1 = list(skipgrams(words1, 2, 0))
    skipgrams2 = list(skipgrams(words2, 2, 0))  # materialize: generators would be exhausted after one pass of the nested loops

    count = 0
    for gram1 in skipgrams1:
        for gram2 in skipgrams2:
            if gram1 == gram2:
                count += 1

    features[4] = count / combinations(n, count)
    features[5] = count / combinations(m, count)
    """if (n > m):
        features[6] = m / n
    else:
        features[6] = n / m"""

    if len(sentence1) > len(sentence2):
        features[7] = len(sentence2) / len(sentence1)
    else:
        features[7] = len(sentence1) / len(sentence2)
    """count = 0
    for word2 in words2:
        p = paraphraseMap[word2] if word2 in paraphraseMap else 0
        q = notParaphrMap[word2] if word2 in notParaphrMap else 0
        if q == 0:
            kl = 1
        elif p == 0:
            kl = 0
        else:
            kl = p * math.log(p/q) + (1-p) * math.log((1-p)/(1-q))
        for word1 in sentence1:
            if word1 == word2:
                count += kl
    features[7] = count / n
    features[8] = count / m"""

    return features
Code example #2
def kskip_ngram_cuoccurrence(answer, example, ng, kskip, question_index):
    distance_over_ngrams = 0
    # nltk skipgrams function doesn't include unigrams, so they are calculated separately:
    summ_distance = 0
    for word_ex in example:
        dist = 0
        for word_an in answer:
            dist_tmp = distance.levenshtein(word_ex, word_an)
            if dist_tmp > dist:
                dist = dist_tmp
        summ_distance += dist
    try:
        distance_over_ngrams += summ_distance / len(example)
    except ZeroDivisionError:  # empty example
        print("zero")
        print(question_index)
    for n in range(2, ng + 1):
        answer_ngramed = list(skipgrams(answer, n, kskip))
        example_ngramed = list(skipgrams(example, n, kskip))
        summ_distance = 0
        for ngram_of_example in example_ngramed:
            dist = 0
            for ngram_of_answer in answer_ngramed:
                dist_tmp = distance.levenshtein(ngram_of_example, ngram_of_answer)
                if dist_tmp > dist:
                    dist = dist_tmp
            summ_distance += dist
        try:
            distance_over_ngrams += summ_distance / len(example_ngramed)
        except ZeroDivisionError:  # no n-grams of this size
            pass
    return distance_over_ngrams
Code example #3
def calculate_rouge_s_score(sent, gold_standard_summary, predicted_summary, n):
    gold_skipgrams = list(skipgrams(gold_standard_summary, n, n))
    pred_skipgrams = list(skipgrams(predicted_summary, n, n))

    # Find common skipgrams
    common_skipgrams = set(gold_skipgrams).intersection(set(pred_skipgrams))

    # Find mC2 for calculating the precision, recall and F1 score
    r = min(2, len(gold_standard_summary) - 2)
    if r == 0:
        # keep the return type consistent with the list returned elsewhere
        return [1, 1, 1]

    if len(gold_standard_summary) != 0:
        # numerator // denominator == C(len(gold_standard_summary), r); requires functools.reduce and operator as op
        numerator = reduce(op.mul, range(len(gold_standard_summary), len(gold_standard_summary) - r, -1))
        denominator = reduce(op.mul, range(1, r + 1))
        gold_skipgram_combinations = numerator // denominator
    else:
        gold_skipgram_combinations = 0

    # Find nC2 for calculating the precision, recall and F1 score
    if len(predicted_summary) != 0:
        r = min(2, len(predicted_summary) - 2)
        if r == 0:
            return [1, 1, 1]
        numerator = reduce(op.mul, range(len(predicted_summary), len(predicted_summary) - r, -1))
        denominator = reduce(op.mul, range(1, r + 1))
        pred_skipgram_combinations = numerator // denominator
    else:
        pred_skipgram_combinations = 0

    if gold_skipgram_combinations != 0:
        recall_skipgram = len(common_skipgrams) / gold_skipgram_combinations
    else:
        recall_skipgram = 0

    if pred_skipgram_combinations != 0:
        precision_skipgram = len(common_skipgrams) / pred_skipgram_combinations
    else:
        precision_skipgram = 0

    if recall_skipgram != 0:
        beta_value = precision_skipgram / recall_skipgram       # Or beta can be hardcoded as 1
    else:
        beta_value = 1

    if (recall_skipgram + ((beta_value ** 2) * precision_skipgram)) != 0:
        f1_skipgram = ((1 + (beta_value ** 2)) * recall_skipgram * precision_skipgram) / (recall_skipgram + ((beta_value ** 2) *
                                                                                                    precision_skipgram))
    else:
        f1_skipgram = 0
    # print "Recall of ROUGE-S for sentence", sent, "is", recall_skipgram
    # print "Precision of ROUGE-S for sentence", sent, "is", precision_skipgram
    # print "F1 score of ROUGE-S for sentence", sent, "is", f1_skipgram
    return [round(precision_skipgram, 4), round(recall_skipgram, 4), round(f1_skipgram, 4)]
Code example #4
def rougeS(candidateSummary, refrenceSummaries):
  B = 1

  candidateTokens = tokenize(candidateSummary)
  skip2  = lambda s: len(set(skipgrams(candidateTokens,2,2)) & set(skipgrams(tokenize(s), 2, 2)))

  Rskip2 = lambda s: float(skip2(s)) / nCr(len(candidateTokens), 2)
  Pskip2 = lambda s: float(skip2(s)) / nCr(len(tokenize(s)), 2)

  Fskip2 = lambda s: (1 + B ** 2) * Rskip2(s) * Pskip2(s) / ( Rskip2(s) + B ** 2 * Pskip2(s) ) if skip2(s) > 0 else 0

  return max(map(Fskip2, refrenceSummaries))
Code example #5
def rougeS(candidateSummary, refrenceSummaries):
    B = 1

    candidateTokens = tokenize(candidateSummary)
    skip2 = lambda s: len(
        set(skipgrams(candidateTokens, 2, 2)) & set(
            skipgrams(tokenize(s), 2, 2)))

    Rskip2 = lambda s: float(skip2(s)) / nCr(len(candidateTokens), 2)
    Pskip2 = lambda s: float(skip2(s)) / nCr(len(tokenize(s)), 2)

    Fskip2 = lambda s: (1 + B**2) * Rskip2(s) * Pskip2(s) / (Rskip2(
        s) + B**2 * Pskip2(s)) if skip2(s) > 0 else 0

    return max(map(Fskip2, refrenceSummaries))
Code example #6
    def text_to_instance(self,
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = [x.text for x in self._tokenizer.tokenize(premise)]
        hypothesis_tokens = [
            x.text for x in self._tokenizer.tokenize(hypothesis)
        ]
        # n-grams from the premise
        prem_trigrams = set(skipgrams(premise_tokens, 3, 1))
        prem_bigrams = set(skipgrams(premise_tokens, 2, 1))
        prem_unigrams = set(ngrams(premise_tokens, 1))

        # n-grams from the hypothesis
        hyp_trigrams = set(skipgrams(hypothesis_tokens, 3, 1))
        hyp_bigrams = set(skipgrams(hypothesis_tokens, 2, 1))
        hyp_unigrams = set(ngrams(hypothesis_tokens, 1))

        # overlap proportions
        if hyp_trigrams:
            tri_overlap = len(
                prem_trigrams.intersection(hyp_trigrams)) / len(hyp_trigrams)
        else:
            tri_overlap = 0.0
        if hyp_bigrams:
            bi_overlap = len(
                prem_bigrams.intersection(hyp_bigrams)) / len(hyp_bigrams)
        else:
            bi_overlap = 0.0
        if hyp_unigrams:
            uni_overlap = len(
                prem_unigrams.intersection(hyp_unigrams)) / len(hyp_unigrams)
        else:
            uni_overlap = 0.0

        fields['features'] = FeaturesField(
            [tri_overlap, bi_overlap, uni_overlap])
        metadata = {
            'premise': premise,
            'hypothesis': hypothesis,
            'premise_tokens': premise_tokens,
            'hypothesis_tokens': hypothesis_tokens
        }
        fields['metadata'] = MetadataField(metadata)
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
Code example #7
def get_data_with_skip_n_grams(n, k):
    data = get_data()
    for index, row in data.iterrows():
        data.at[index, 'body'] = list(
            skipgrams(data.loc[index, 'body'].split(), n, k))
    return data
Code example #8
    def decompose(self, corpus):

        skip_counts = bounter(size_mb=1024)
        word_counts = bounter(size_mb=1024)
        for l in corpus:
            wds = l.split()
            skips = list(skipgrams(wds, 2, 5))
            skips = ["#".join(t) for t in skips]
            if len(wds) > 0 and len(skips) > 0:
                skip_counts.update(skips)
                word_counts.update(wds)

        vocabulary = list(word_counts)
        shift = 1  # shift 1 does nothing since log(1) == 0.0
        M = count_skipgrams(skip_counts, word_counts, vocabulary, shift)
        # TODO: eigen something trick
        # singular value decomposition
        # U, _, V = svds(M, k=256)  # U, S, V
        U, _, V = sparsesvd(M, 300)
        # add context to U
        word_vecs = U.T + V.T
        del U
        del V
        # normalize rows: scale each word vector to unit length
        word_vecs_norm = word_vecs / np.sqrt(
            np.sum(word_vecs * word_vecs, axis=1, keepdims=True))
        del word_vecs
        return vocabulary, word_vecs_norm
Code example #9
 def Word_k_skip_n_gram(self, k, n, words):
     words = self.word_token(words)
     if k == 0:
         ret = list(ngrams(words, n))
     else:
         ret = list(skipgrams(words, n, k))
     return ret
Code example #10
File: logique.py  Project: Krolov18/Lexiques
def g(mot):
    # from re import sub
    from nltk.util import skipgrams
    tmp = list(range(len(mot)))  # character positions of mot
    for i in range(2, len(mot) + 1):
        for skip in skipgrams(tmp, i, len(mot)):
            # yield sub("\.\.+", ".+", replace(mot, skip, '.'))
            yield skip
Code example #11
def computeSimple(sentence1, sentence2):
    features = [0] * 8  # indices 0-7 are filled below
    tokenizer = RegexpTokenizer(r'\w+')
    words1 = tokenizer.tokenize(sentence1)
    words2 = tokenizer.tokenize(sentence2)
    n = len(words1)
    m = len(words2)

    # word overlap features
    count = 0 # num of same words in sentence
    for word1 in words1:
        for word2 in words2:
            if word1 == word2:
                count += 1

    features[0] = count / n # "precision"
    features[1] = count / m # "recall"

    features[2] = sentence_bleu([sentence1], sentence2)
    features[3] = sentence_bleu([sentence2], sentence1)

    # Obtain pairs of adjacent words
    skipgrams1 = list(skipgrams(words1, 2, 0))
    skipgrams2 = list(skipgrams(words2, 2, 0))  # materialize: generators would be exhausted after one pass of the nested loops

    count = 0
    for gram1 in skipgrams1:
        for gram2 in skipgrams2:
            if gram1 == gram2:
                count += 1

    features[4] = count / combinations(n, count)
    features[5] = count / combinations(m, count)


    """if (n > m):
        features[6] = m / n
    else:
        features[6] = n / m"""

    if len(sentence1) > len(sentence2):
        features[7] = len(sentence2) / len(sentence1)
    else:
        features[7] = len(sentence1) / len(sentence2)

    return features
Code example #12
def k_skip_n_grams(sent, k, n):
    """
    Apply the skipgrams method from NLTK and return the results in list format.
    :param sent: input sentence in which to look for k-skip-n-grams
    :param k: k parameter from k-skip-n-grams
    :param n: n parameter from k-skip-n-grams
    :return: a list containing all the k-skip-n-grams found in sent
    """
    return list(skipgrams(sent, k=k, n=n))
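A minimal illustration of the wrapper above, assuming the sentence is already tokenized into a list of words:

print(k_skip_n_grams(['we', 'like', 'skip', 'grams'], k=1, n=2))
# [('we', 'like'), ('we', 'skip'), ('like', 'skip'), ('like', 'grams'), ('skip', 'grams')]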
Code example #13
    def compute_skipgrams(self, win_size, min_freq=0, min_sg=0):
        print('Counting skipgrams and building vocab \n')
        # Initialize variables
        tok2indx = dict()
        unigram_counts = Counter()
        doc_all_clean = []
        skipgrams_list = []
        win_size_ext = win_size // 2
        # Loop through text and preprocess
        for doc_indx, doc in enumerate(self.text):
            if doc_indx % 10000 == 0:
                print('{} reviews processed'.format(doc_indx))
            # Update unigram counts
            for token in doc:
                unigram_counts[token] += 1
                if token not in tok2indx:
                    tok2indx[token] = len(tok2indx)
            # Update Skipgram counts
            skipgrams_list.append(list(skipgrams(doc, 2, win_size_ext)))

        # Flatten skip grams into flat list
        skipgram_counts = Counter([
            skipgram for skip_list in skipgrams_list for skipgram in skip_list
        ])
        # Given the bottom-up process used to count unigrams and skipgrams, we
        # have to threshold out words that don't meet the user-specified min_freq
        # threshold after the fact. Need a way to optimize this section of code
        if min_freq > 0:
            # Threshold unigram counts based off 'min_freq'
            unigram_counts = Counter({
                token: unigram_counts[token]
                for token in unigram_counts
                if unigram_counts[token] >= min_freq
            })
            # Create new tok2indx
            tok2indx = {
                token: count
                for count, token in enumerate(unigram_counts.keys())
            }
            # Remove skipgrams that contain removed unigram tokens
            # This is poorly optimized, need a better solution
            skipgram_counts = Counter({
                sg_tuple: count
                for sg_tuple, count in skipgram_counts.items()
                if all(token in unigram_counts.keys() for token in sg_tuple)
                if count >= min_sg
            })
        # Set necessary objects to class object
        self.tok2indx = tok2indx
        self.skipgram_counts = skipgram_counts
        # Print unigram counts
        print('vocabulary size: {}'.format(len(unigram_counts)))
        print('most common: {} \n'.format(unigram_counts.most_common(10)))
        # Print skipgram counts
        print('number of skipgrams: {}'.format(len(skipgram_counts)))
        print('most common: {} \n'.format(skipgram_counts.most_common(10)))
Code example #14
    def make_cooccurrence_matrix(self, k, dtype=np.float32):
        X = np.zeros((self.ntokens, self.ntokens), dtype=dtype)
        word_int, int_word = hash_vocabulary(self.vocabulary)

        for i, S in enumerate(self.behavioural_ngrams):
            skipgram_generator = skipgrams(S, 2, k)
            for s in skipgram_generator:
                idx1 = word_int[s[0]]
                idx2 = word_int[s[1]]
                X[idx1, idx2] += 1

        self.cooccurrence_matrix = X
Code example #15
File: utils.py  Project: ryparmar/master-thesis
def create_skipgrams(data, cue: str, skip: int, tokenizer=None):
    """
    Cue is represented by skipgram.
    Creates skipgrams and cue counts per label and per document.

    cue = Cue representation: bigram, trigram
    skip = number of skipped tokens
    Note: if skip == 4, then skipgrams function generates all the skipgrams with 0, 1, 2, 3 and 4 skipped tokens.
    
    Returns:
    skipgrams_per_label = {'V Hradci': {'Supports': 4, 'Refutes': 1}, ...}
    skipgrams_total = {'V Hradci': 5, ...}
    skipgrams_document_frequency = {'V Hradci': 2, ...}
    total_documents = total number of documents
    """
    skipgrams_per_label, skipgrams_total = {}, {}
    skipgrams_document_frequency, total_documents = {}, len(data['claim'])
    rep2int = {'unigram': 1, 'wordpiece': 1, 'bigram': 2, 'trigram': 3}
    for i, claim in enumerate(data['claim']):
        # TODO rewrite -- added expost and slightly dumb? (calculating same thing as in the applicability?!)
        _skipgrams = (skipgrams(claim.split(), rep2int[cue], skip)
                      if rep2int[cue] > 1
                      else claim_to_unigrams(claim) if cue == 'unigram'
                      else claim_to_wordpieces(claim, tokenizer))
        for skipgram in _skipgrams:
            skipgram = " ".join(
                list(skipgram)) if rep2int[cue] > 1 else "".join(
                    list(skipgram))
            # Count skipgrams per cue
            if skipgram in skipgrams_total:
                skipgrams_total[skipgram] += 1
            else:
                skipgrams_total[skipgram] = 1
            if skipgram in skipgrams_per_label:
                if data['label'][i] in skipgrams_per_label[skipgram]:
                    skipgrams_per_label[skipgram][data['label'][i]] += 1
                else:
                    skipgrams_per_label[skipgram][data['label'][i]] = 1
            else:
                skipgrams_per_label[skipgram] = {data['label'][i]: 1}

            # Count document frequency per cue
            if skipgram in skipgrams_document_frequency:
                skipgrams_document_frequency[skipgram].add(i)
            else:
                skipgrams_document_frequency[skipgram] = set([i])

    # Count the distinct docs
    for k, v in skipgrams_document_frequency.items():
        skipgrams_document_frequency[k] = len(v)

    return skipgrams_per_label, skipgrams_total, skipgrams_document_frequency, total_documents
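The docstring's note about the skip parameter (skip == 4 also yields the pairs with 0, 1, 2 and 3 skipped tokens) can be checked directly against nltk.util.skipgrams; a minimal sketch with an illustrative token list:

from nltk.util import skipgrams

tokens = "a b c d e".split()
# With n=2 and k=2, every pair at most two positions apart is produced,
# including the plain adjacent bigrams (zero skips).
print(list(skipgrams(tokens, 2, 2)))
# [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('b', 'e'),
#  ('c', 'd'), ('c', 'e'), ('d', 'e')]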
Code example #16
File: preprocess.py  Project: schan27/DMT
def skipgram_tokenize(sentence,
                      n=None,
                      k=None,
                      include_all=False,
                      analyzer='word'):
    def basic_tokenize(sentence, analyzer='word'):
        if analyzer == 'word':
            return sentence.split()
        else:
            sentence = sentence.replace(' ', '')
            return list(sentence)

    from nltk.util import skipgrams
    tokens = [w for w in basic_tokenize(sentence, analyzer)]
    if include_all:
        result = []
        for i in range(k + 1):
            skg = [w for w in skipgrams(tokens, n, i)]
            result = result + skg
    else:
        result = [w for w in skipgrams(tokens, n, k)]
    result = set(result)
    return result
Code example #17
def extract_sentence(text):
    lst = []  # fallback so the final join cannot raise NameError when the text yields no bigrams
    for lst in TextBlob(text).ngrams(2):
        if 'months' in lst or 'years' in lst:
            length = ''
            try:
                length = int(DIGITS.get(lst[0], lst[0]))
            except ValueError as e:
                print(e)
                pass
            # print('{} {}'.format(' '.join(lst), length))
            return ' '.join(lst)
        if 'life' in lst:
            print(text)
            for s in skipgrams(text.split(), 3, 2):
                if 'sentenced' == s[0]:
                    pass
                    # print(s)
            print('\n\n')
    return ' '.join(lst)
Code example #18
File: nltk.py  Project: thePortus/dhelp
    def skipgrams(self, gram_size=3, skip_size=1):
        """Gives skipgrams.

        Returns list of skipgrams, similar to ngram, but allows spacing between
        tokens.

        Args:
            gram_size (:obj:`int`, optional) Size of the ngrams to generate
            skip_size (:obj:`int`, optional) Size of max spacing allowed

        Returns:
            :obj:`list` of :obj:`tuple` Words of each skipgram

        Example:
            >>> text = EnglishText('They hated to think of sample sentences.')
            >>> basic_skipgrams = text.skipgrams()
            >>> print(basic_skipgrams)
            [('They', 'hated', 'to'), ('They', 'hated', 'think'), ('They', 'to', 'think'), ('hated', 'to', 'think'), ('hated', 'to', 'of'), ('hated', 'think', 'of'), ('to', 'think', 'of'), ('to', 'think', 'sample'), ('to', 'of', 'sample'), ('think', 'of', 'sample'), ('think', 'of', 'sentences'), ('think', 'sample', 'sentences'), ('of', 'sample', 'sentences'), ('of', 'sample', '.'), ('of', 'sentences', '.'), ('sample', 'sentences', '.')] # noqa
        """
        tokens = self.tokenize()
        return list(skipgrams(tokens, gram_size, skip_size))
Code example #19
File: wordnet.py  Project: MOC-IP/SuperView-processor
def get_word_context_scores(reviews,
                            n=2,
                            k=2,
                            min_instances=None,
                            min_percentage=1.0):
    if min_instances is None:
        min_instances = min([5, int((len(reviews) / 100) * min_percentage)])

    word2context_score = dict()
    for review in reviews:
        i = -1
        stars = review['stars']
        skipgrams_list = skipgrams(review["words"], n, k)
        for skipgram in skipgrams_list:
            for i in range(len(skipgram) - 1):
                word1 = skipgram[i]
                for j in range(i + 1, len(skipgram)):
                    word2 = skipgram[j]
                    if word1 not in word2context_score:
                        word2context_score[word1] = dict()
                    if word2 not in word2context_score:
                        word2context_score[word2] = dict()
                    if word1 not in word2context_score[word2]:
                        word2context_score[word1][word2] = [0.0, 0]
                    if word2 not in word2context_score[word1]:
                        word2context_score[word2][word1] = [0.0, 0]
                    word2context_score[word1][word2][0] += stars
                    word2context_score[word1][word2][1] += 1
                    word2context_score[word2][word1][0] += stars
                    word2context_score[word2][word1][1] += 1

    for word1 in list(word2context_score):  # iterate over a copy; entries may be deleted below
        instances = 0
        for word2 in word2context_score[word1]:
            instances += word2context_score[word1][word2][1]
            if instances >= min_instances:
                break
        if instances < min_instances:
            del word2context_score[word1]
    return word2context_score
Code example #20
def skipgram_analyzer(s, skipgram_list=[(1, 0)]):
    '''
    An analyzer that splits a string s into a list of (n, k) skipgrams
    for each (n, k) pair in the skipgrams list.

    Because skipgrams also produce all the ngrams without skips, it
    is very easy for this function to produce repeated data.  One must
    be careful of what is passed in as the skipgram list.  A good
    start would be [(1, 0), (2, 3), (3, 2)].  But notice that in [(1,
    0), (2, 0), (2, 3)] the tuple (2, 0) is redundant and shouldn't be
    passed in.
    '''
    s = word_tokenize(s.lower())
    s = list(filter(lambda c: c not in string.punctuation, s))

    ret = []
    for n, k in skipgram_list:
        if k == 0:
            ret += list(ngrams(s, n))
        else:
            ret += skipgrams(s, n, k)
    return ret
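A hypothetical call, assuming the imports the snippet relies on (the string module, nltk.word_tokenize with the punkt data installed, and nltk.util.ngrams/skipgrams) are in scope, using the (n, k) list the docstring suggests:

import string
from nltk import word_tokenize
from nltk.util import ngrams, skipgrams

# Illustrative sentence; the analyzer lowercases tokens and drops punctuation.
features = skipgram_analyzer("The quick brown fox jumps over the lazy dog.",
                             skipgram_list=[(1, 0), (2, 3), (3, 2)])
print(features[:3])  # unigrams come first, e.g. [('the',), ('quick',), ('brown',)]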
Code example #21
    def generate_batch(self, file_paths, batch_size):
        samples = []
        labels = []
        unknown = self._word_to_index['UNKNOWN']
        for file_path in file_paths:
            with open(file_path, 'r', errors='ignore') as _file:
                for paragraph in _file:
                    sentences = normalise_line(paragraph)
                    for sentence in sentences:
                        for sample, label in skipgrams(sentence.split(), 2,
                                                       self.context_size):
                            samples.append(
                                self._word_to_index.get(sample, unknown))
                            labels.append(
                                [self._word_to_index.get(label, unknown)])

                            if len(samples) == batch_size:
                                yield (
                                    samples,
                                    labels,
                                )
                                samples = []
                                labels = []
Code example #22
def process_caption(raw, method, most_common_):
    delete_hashtag = ' '.join(
        re.sub(r"(\.\.)|#[a-zA-Z-0-9]+", " ", raw).split())
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s',
                         delete_hashtag)
    rawData = []
    for i in sentences:
        # the fourth positional argument of re.sub is count, so flags must be passed by keyword
        remove_punc = re.sub(r'[^\w\s]', '', i, flags=re.UNICODE)
        use_lowercase = " ".join(remove_punc.lower()
                                 for remove_punc in remove_punc.split())
        use_stopwords = " ".join(use_lowercase
                                 for use_lowercase in use_lowercase.split()
                                 if use_lowercase not in stopwordplus)
        token = nltk.word_tokenize(use_stopwords)
        if method == 1:
            uni = token
            rawData.extend(uni)
        elif method == 2:
            big = ngrams(token, 2)
            skip = list(skipgrams(token, 2, 1))
            rawData.extend(chain(big, skip))
        else:
            rawData.append("method input is false")
    return Counter(rawData).most_common(most_common_)
Code example #23
def k_skip_n_grams(sent, k, n):
    return list(skipgrams(sent, k=k, n=n))
Code example #24
def getSkipgram(sentence, n, k):
    grams = []
    for i in range(k + 1):
        words = [w for w in skipgrams(sentence, n, i)]
        grams = grams + words
    return grams
Code example #25
File: nltk_doc.py  Project: thePortus/arakhne
 def skipgrams(self, gram_size=3, skip_size=1):
     tokens = self.tokenize()
     return list(skipgrams(tokens, gram_size, skip_size))
Code example #26
File: parseText.py  Project: GarfieldLyu/OCR_POST_DE
 def skipgramTokenize(self, passage):
     tokens = self.tokenize(passage)
     Skipgrams = [
         ' '.join(gram) for gram in skipgrams(tokens, self.n, self.k)
     ]
     return Skipgrams
Code example #27
 def __call__(self, t):
     text = self.reduce_lengthening(t)
     tokens = list(self.tknzr.tokenize(text))
     negated_tokens = mark_negation(tokens)
     list_of_skipgrams = list(skipgrams(negated_tokens, self.n, self.k))
     return list([' '.join(s) for s in list_of_skipgrams])
Code example #28
print(len(vocabulary))

wd_times = {}
edges = bounter(size_mb=1024)
i = len(vocabulary)
for wd in vocabulary:
    print(f"remains {i} words")
    times = []
    for k, v in month_utterances.items():
        for e in v:
            e = e.split()
            e = [w for w in e if w in vocabulary]
            if len(e) > 1:
                if wd in set(e):
                    times.append(int(k))
                skips = list(skipgrams(e, 2, 3))
                skips = [skip[0] + "_" + skip[1] for skip in skips]
                edges.update(skips)
    if len(times) > 0:
        sdate = min(times)
        edate = max(times)
        t = (sdate, edate)
        wd_times[wd] = t
    i -= 1

G = nx.Graph()
for k, v in wd_times.items():
    G.add_node(k, start=v[0], end=191)

total = edges.total()
for e in edges.items():
Code example #29
File: rouge_score.py  Project: 53X/nltk
def rouge_s(references,
            candidate,
            beta,
            d_skip=None,
            averaging=True,
            smoothing=False):
    '''
    It implements the ROUGE-S and ROUGE-SU scores.
    The skip-bigram concept has been used here.
    
    :param references : list of all references where all references 
                       have been tokenized into words
    :type references : list(list(str))

    :param candidate : list of word in the candidate string
    :type candidate : list(str)

    :param beta : user-defined parameter for the calculation 
                  of F1 score
    :type beta : float             
    

    :param d_skip : the skip_distance(k) parameter for skipgram
    :type d_skip : int
    

    :param smoothing : setting this to True allows for
                       ROUGE-SU implementation.The ROUGE-SU
                       implementation helps in the unigram
                       smoothing.
    :type smoothing : boolean

    :param averaging : Jacknifing occurs if averaging is True
    :type averaging : Boolean


     
    k_c : distance parameter for candidate in the skipgram
    
    k_ref : distance parameter for reference in the skipgram
    
    cand_skipgram : list of all skipgrams of the candidate
    
    ref_skipgram : list of all skipgrams of the reference
    
    r_skip : recall factor
    
    p_skip : precision factor
    
    score : rouge-s(or SU) score between a reference and the candidate

    count : variable counting the no. of matching skipgrams between the 
            candidate and the reference

    rouge_s_list : list of the Rouge-S ( or SU) scores
                   for every reference and the candidate
    
    '''
    rouge_s_list = []

    k_c = len(candidate) if d_skip is None else d_skip
    cand_skipgram = list(skipgrams(candidate, n=2, k=k_c))

    for ref in references:  #iterating over each reference

        k_ref = len(ref) if d_skip is None else d_skip
        ref_skipgram = list(skipgrams(ref, n=2, k=k_ref))

        count = 0
        for bigram in cand_skipgram:
            if bigram in ref_skipgram:
                count += 1
        ''' 
        Calculating ROUGE-S  precision and recall factors:

        '''

        r_skip = count / len(ref_skipgram)
        p_skip = count / len(cand_skipgram)

        if smoothing:
            '''
            Calculating ROUGE-SU by applying unigram smoothing

            '''

            for unigram in candidate:  # iterating over candidate unigrams

                if unigram in ref:  # checking the presence of common of unigrams with reference
                    count += 1

            r_skip = count / (len(ref_skipgram) + len(ref))
            p_skip = count / (len(cand_skipgram) + len(candidate))

        score = get_score(r_skip, p_skip, beta)
        rouge_s_list.append(score)
    return jacknifing(rouge_s_list, averaging=averaging)
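A sketch of how this might be called; get_score and jacknifing are helpers from the same module and are assumed to be in scope, and the tokenized sentences are only illustrative:

from nltk.util import skipgrams

references = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
candidate = ['the', 'cat', 'was', 'on', 'the', 'mat']
# ROUGE-S with a skip distance of 4; pass smoothing=True for ROUGE-SU instead.
print(rouge_s(references, candidate, beta=1.0, d_skip=4, averaging=True, smoothing=False))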
Code example #30
def main():
    # Start the counter
    start = time.time()

    # Load raw data and tokenize
    corpus_file = str(sys.argv[1])
    print("Processing...")
    merged = []
    no_words = 0
    window_size = 2
    skipgram_list = []

    def read_words(inputfile):
        with open(inputfile, 'r', encoding='utf-8') as f:
            while True:
                buf = f.read(102400)
                if not buf:
                    break
                while not str.isspace(buf[-1]):
                    ch = f.read(1)
                    if not ch:
                        break
                    buf += ch
                words = buf.split()
                for word in words:
                    yield word
            yield ''

    no_lines = 0

    line_tokens = []
    for word in read_words(corpus_file):
        no_lines += 1
        if '.' not in word:
            line_tokens.append(
                word.translate(str.maketrans('', '', string.punctuation)))
        else:
            line_tokens.append(
                word.translate(str.maketrans('', '', string.punctuation)))
            merged.append(line_tokens)
            line_tokens = []
        if no_lines % 100000 == 0:
            sys.stdout.write("\rWords: %i" % int(no_lines))
            sys.stdout.flush()

    # Function to create a dataframe with counts and probabilities
    def create_count_df(list_to_count, skipgrams, sample_rate):
        list_with_counts = collections.Counter(list_to_count)
        df = pd.DataFrame()
        df2 = pd.DataFrame()
        if skipgrams == False:
            df['word'] = list_with_counts.keys()
            df['count'] = list_with_counts.values()
            df = df[df['count'] > 4]
            df['prob'] = df['count'] / sum(df['count'])
            # calculate negative sample probability
            df['weight'] = df['prob']**(3 / 4)
            df['neg_samp_prob'] = df['weight'] / sum(df['weight'])
            # subsample
            df['prob_sub'] = (np.sqrt(df['prob'] / sample_rate) +
                              1) * sample_rate / df['prob']
            df2['word'] = list_to_count
            df2 = df2.join(df.set_index('word'), on='word')
            df = df[df['prob_sub'] > sample_rate]
            df2 = df2[df2['count'] > 4]
            df2 = df2[df2['prob_sub'] > sample_rate]
            df = df[['word', 'prob', 'neg_samp_prob']]
            return df, df2['word'].tolist()
        else:
            word_list1 = []
            word_list2 = []
            for item in list_with_counts.keys():
                word_list1.append(item[0])
                word_list2.append(item[1])
            df['word1'] = word_list1
            del word_list1
            df['word2'] = word_list2
            del word_list2
            df['count'] = list_with_counts.values()
            df['prob'] = df['count'] / sum(df['count'])
            df = df[['word1', 'word2', 'prob']]
            return df

    # Create the list of unigrams with the count and normalize probability
    print("\nCreating the list of unigrams...")
    sample_rate = 0.001
    unigram_df, tokens = create_count_df(
        [item for sublist in merged for item in sublist], False, sample_rate)
    print("# unigrams: ", unigram_df.shape[0])
    print("Creating the list of skipgrams...")
    no_words = 0
    A = ahocorasick.Automaton()
    for idx, key in enumerate(tokens):
        A.add_word(key, (idx, key))
    for line_counter, line in enumerate(merged):
        line_skipgrams = list(skipgrams(line, window_size, window_size))
        for skipgram in line_skipgrams:
            if (skipgram[0] in A) and (skipgram[1] in A):
                skipgram_list.append(skipgram)
        no_words += len(line)
        if line_counter % 100000 == 0:
            sys.stdout.write("\rWords: %i" % int(no_words))
            sys.stdout.flush()
    del tokens
    del merged
    print("\nCreating skipgram data frame...")
    skipgram_df = create_count_df(skipgram_list, True, sample_rate)
    print("# skipgrams: ", skipgram_df.shape[0])
    del skipgram_list

    # Optimize the skipgram dataframe to reduce the size by ~ 90%
    print("Optimizing...")
    skipgram_df['word1'] = skipgram_df.word1.astype('category')
    skipgram_df['word2'] = skipgram_df.word2.astype('category')
    skipgram_df['prob'] = skipgram_df.prob.astype('float32')

    # Calculate PMI values for each skipgram
    print("Calculating PMI...")
    unigram_df = unigram_df.set_index('word')
    skipgram_df['prob1'] = skipgram_df['word1'].map(
        unigram_df['prob'].get).astype('float32')
    skipgram_df['prob2'] = skipgram_df['word2'].map(
        unigram_df['prob'].get).astype('float32')
    skipgram_df['pmi'] = np.log(
        skipgram_df['prob'] /
        (skipgram_df['prob1'] * skipgram_df['prob2'])).astype('float32')
    skipgram_df = skipgram_df[['word1', 'word2', 'pmi']]

    unigram_df = unigram_df.reset_index()
    vocab_length = unigram_df.shape[0]

    # Create the unigram table for negative sampling
    print("Creating negative samples table...")
    table_size = 10000000
    neg_samp_list = []
    row_index = 0
    for index, row in unigram_df.iterrows():
        rate = row['neg_samp_prob'] * table_size
        for i in range(0, int(rate)):
            neg_samp_list.append(row_index)
        row_index += 1

    # Create the list of co-occurence probabilities for the output vector
    print("Preparing empty output vectos...")
    output_vectors = np.zeros(shape=(vocab_length, vocab_length))
    unigram_index_list = list(range(0, vocab_length))
    unigram_df['index_list'] = unigram_index_list
    del unigram_index_list
    unigram_df = unigram_df.set_index('word')
    skipgram_df['word_index1'] = skipgram_df['word1'].map(
        unigram_df['index_list'].get)
    skipgram_df['word_index2'] = skipgram_df['word2'].map(
        unigram_df['index_list'].get)
    skipgram_df = skipgram_df[[
        'word_index1',
        'word_index2',
        'pmi',
    ]]
    print("Populating the output vectors for training...")
    row_count = 0
    for row in skipgram_df.itertuples(index=True):
        output_vectors[int(getattr(row, "word_index1"))][int(
            getattr(row, "word_index2"))] = getattr(row, "pmi")
        if row_count % 1000000 == 0:
            sys.stdout.write("\rProgress: %.2f %%" %
                             float(row_count / skipgram_df.shape[0]) * 100)
            sys.stdout.flush()
        row_count += 1
    unigram_df = unigram_df.reset_index()
    word_list = unigram_df['word'].tolist()
    del unigram_df

    #  Initialize the weights
    print("\nInitializing the network...")
    no_hid = int(sys.argv[2])
    epochs = 5
    neg_samp_no = 20
    starting_alpha = 0.025
    alpha = starting_alpha
    syn0 = np.random.uniform(low=-0.5 / no_hid,
                             high=0.5 / no_hid,
                             size=(vocab_length, no_hid))
    syn1 = np.zeros(shape=(vocab_length, no_hid))

    # Helper functions
    def sigmoid(x):
        return 1 / (1 + math.exp(-x))

    def softmax(x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    # Train the network using negative sampling
    print("\nTraining the network...")
    train_counter = 0
    for epoch in range(0, epochs):
        for row in skipgram_df.itertuples(index=True):
            u_layer = np.zeros(no_hid)
            softmax_array = []
            c_index_array = []
            w_index = int(getattr(row, "word_index1"))
            c_index_array.append(int(getattr(row, "word_index2")))
            softmax_array.append(float(getattr(row, "pmi")))
            for neg_samp in range(0, neg_samp_no):
                neg_samp_pos = random.randint(0, len(neg_samp_list) - 1)
                c_index = neg_samp_list[neg_samp_pos]
                c_index_array.append(c_index)
                softmax_array.append(output_vectors[w_index][c_index])
            softmax_array = softmax(softmax_array)
            for i in range(0, len(c_index_array)):
                c_index = c_index_array[i]
                label = softmax_array[i]
                if alpha < starting_alpha * 0.0001:
                    alpha = starting_alpha
                f1 = sigmoid(np.dot(syn0[w_index], syn1[c_index]))
                f1_error = alpha * (label - f1)
                u_layer += f1_error * syn1[c_index]
                syn1[c_index] += f1_error * syn0[w_index]
                alpha -= starting_alpha / (skipgram_df.shape[0] * epochs *
                                           neg_samp_no)
            syn0[w_index] += u_layer
            if train_counter % 10000 == 0:
                sys.stdout.write("\rProgress: %.2f %% Alpha: %.5f" %
                                 (float(train_counter /
                                        (skipgram_df.shape[0] * epochs)) * 100,
                                  float(alpha)))
                sys.stdout.flush()
            train_counter += 1

    # Save the model
    print("\nSaving the model...")
    word_list_name = '_'.join([sys.argv[3], 'wordlist.p'])
    vectors_name = '_'.join([sys.argv[3], 'vectors.hdf5'])
    output_word_list = open(word_list_name, 'wb')
    pickle.dump(word_list, output_word_list)
    output_word_list.close()
    vectors_file = h5py.File(vectors_name, 'w')
    vectors_file.create_dataset('vectors', data=syn0)
    vectors_file.close()

    # Print out overall statistics of the run
    end = time.time()
    print("Running time: ", str(int((end - start) / 60)), "minutes")
Code example #31
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if (file.endswith(".txt")):
            filepath = subdir + os.sep + file
            is_female = False
            if subdir.endswith('female'):
                females += 1
                is_female = True
            elif subdir.endswith('male'):
                males += 1
            total_count += 1
            print(filepath)
            with open(filepath, 'r') as f:
                content = f.read().upper().replace('.', '').replace(
                    '!', '').replace('?', '').split(" ")
                grams = skipgrams(content, number_of_words, skip_dist)
                for w in content:
                    if len(ignore_words.intersection([w])) == 0:
                        if is_female:
                            if w in female_word_count:
                                female_word_count[w] = female_word_count[w] + 1
                            else:
                                female_word_count[w] = 1
                        else:
                            if w in male_word_count:
                                male_word_count[w] = male_word_count[w] + 1
                            else:
                                male_word_count[w] = 1
                        if w in total_word_count:
                            total_word_count[w] = total_word_count[w] + 1
                        else:
                            total_word_count[w] = 1
Code example #32
 def _get_filtered_skipbigrams(self, words):
     filtered_skipped_bigrams = []
     for bi in skipgrams(words, 2, NGramsContainer.skip_for_skipgram):
         if not any(w in stopwords for w in bi) and bi[0] != bi[1]:
             filtered_skipped_bigrams.append(bi)
     return filtered_skipped_bigrams