def __init__(self):
     self.unimass = deft(float)
     self.bimass = deft(float)
     self.unigrams = Counter()
     self.bigrams = Counter()
     self.umass = 0.0
     self.bmass = 0.0
Example #2
 def __init__(
     self,
     min_cluster=3,
     min_wfreq=3,
     max_wfreq=0.2,
     baselines=10,
     window=10,
     novel_ratio=0.75,
     doc_overlap=0.25,
     min_assoc_multiplier=1,
     verbose=False,
 ):
     self.min_cluster = min_cluster
     self.min_wfreq = min_wfreq
     self.max_wfreq = max_wfreq
     self.baselines = baselines
     self.min_assoc = min_assoc_multiplier
     self.novel_ratio = novel_ratio
     self.doc_overlap = doc_overlap
     self.window = window
     self.bows = []
     self.window_by_wid = deft(set)
     self.is_feature = deft(bool)
     self.wfreq = Counter()
     self.ti = TermIndex('')
     self.bows_by_wid = deft(list)
     self.wids_by_bow = deft(list)
     self.assignments = deft(list)
     self.bowid_by_docid = dict([])
     self.verbose = verbose
     self.log = sys.stderr
     if self.verbose: self.log.write('%s\n' % str(self))
Example #3
 def clear(self):
     self.bows = []
     self.window_by_wid = deft(set)
     self.is_feature = deft(bool)
     self.wfreq = Counter()
     self.ti = TermIndex('')
     self.bows_by_wid = deft(list)
     self.wids_by_bow = deft(list)
     self.assignments = deft(list)
     self.bowid_by_docid = dict([])
 def __init__(self, OUT, TMP, CONFIDENCE):
     self.src = OUT
     self.temp = '%s.smoothing.temp' % '.'.join(TMP.split('.')[:-2])
     self.confidence = CONFIDENCE
     self.posterior = Counter()
     self.prior = Counter()
     self.i = 0
     self.minimum = 0
     self.head = dict([])
     self.grams_by_line = deft(set)
     self.smoothed = deft(bool)
     self.regexs = dict([])
Example #5
def make_inverted_index(mat):
    docids_by_dimid = deft(set)
    for docid, v in enumerate(mat):
        for dimid, w in enumerate(v):
            if w:
                docids_by_dimid[dimid].add(docid)
    return docids_by_dimid
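A minimal usage sketch for the helper above; the toy matrix and the expected contents are illustrative, assuming deft is collections.defaultdict as in the other examples:

from collections import defaultdict as deft

# Toy term-weight matrix: rows are documents, columns are dimensions.
mat = [
    [1, 0, 2],
    [0, 0, 1],
]
index = make_inverted_index(mat)
# index[0] == {0}; index[2] == {0, 1}; dimension 1 never fires, so it stays absent.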
Example #6
def unigram_frequencies(preprocessor, WORK, MAX_F, _n):

    freqDist = Counter()
    print('Collecting unigram frequencies...')
    ndocs = 0
    for line in Streamer(WORK, n=_n):
        freqDist.update(set(preprocessor(line).split()))
        ndocs += 1

    freqBand = FrequencyBand(freqDist)

    print('Determining frequency thresholds...')
    if isinstance(MAX_F, float):
        maxfreq = int(ndocs * MAX_F)
    elif isinstance(MAX_F, int):
        maxfreq = MAX_F
    else:
        maxfreq = freqBand.max_f()

    unigram_f = deft(bool)
    most_freq = freqDist.most_common()
    for i, (w, f) in enumerate(most_freq):
        # The system ignores the top k most frequent words: a True value is
        # stored only for words below the maximum frequency, since those are
        # the ones treated as relevant, informative content.
        if f < maxfreq:
            unigram_f[w] = True
    print('Done!')
    print(freqBand)
    print('%d words below maxfreq' % (
        len([w for w, boolean in unigram_f.items() if boolean])
    ))
    return unigram_f, freqBand
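The returned unigram_f is a defaultdict(bool), so downstream code can filter tokens without membership tests; a minimal sketch of that idiom with made-up tokens:

from collections import defaultdict as deft

unigram_f = deft(bool)
unigram_f['model'] = True
tokens = ['the', 'model', 'converges']
kept = [w for w in tokens if unigram_f[w]]   # ['model']; unseen words default to False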
 def __init__(
     self,
     streamer,
     nn=[1, 2, 3]
 ):
     self.nn = nn
     self.streamer = streamer
     self.words = TermIndex('words')
     self.lefts = deft(Counter)
     self.rights = deft(Counter)
     self.orders = dict([])
     for n in self.nn:
         self.orders[n] = deft(Counter)
         self.orders[-n] = deft(Counter)
     self.mass = deft(float)
     self.priors = Counter()
 def __init__(
     self,
     stream,
     ngrams=[
         (1, False),
         (2, False),
         (3, False),
         (4, False),
         (5, False),
         (3, True),
         (4, True),
         (5, True)
     ],
     min_f=2,
     ratio_within=0.8,
     transversal_filter=deft(bool),
     wrapping_filter=deft(bool)
 ):
     self.min_f = min_f
     self.ratio_within = ratio_within
     self.words = TermIndex('')
     self.ngrams = ngrams
     self.stream = stream
     self.priors = Counter()
     self.mass = 0.0
     self.posteriors = deft(Counter)
     self.masses = deft(float)
     self.skipped = deft(Counter)
     self.skippers = deft(Counter)
     self.transversal_filter = transversal_filter
     self.wrapping_filter = wrapping_filter
     self.rewrites = deft(tuple)
     self.relations = Counter()
     self.relaheads = Counter()
 def __init__(self, ngrams=[], ch_ngrams=[], skip_ngrams=[], lang='en'):
     self.lang = lang
     self._ngrams = ngrams
     self._ch_ngrams = ch_ngrams
     self._skip_ngrams = skip_ngrams
     self.stopwords = deft(bool)
     for w in STOPWORDS[self.lang]:
         if w not in EXCEPTIONS[self.lang]:
             self.stopwords[w] = True
 def __inside_extract(self):
     for n in [5, 4, 3]:
         processed = deft(bool)
         posteriors = [
             (gids, freq) for gids, freq in self.posteriors[(n, False)].items()
             if freq > 4
         ]
         for gids, freq in tqdm(posteriors):
             self.__extract_relations(processed, n, gids, freq)
 def __init__(self, max_df=20000):
     self.df = Counter()
     self.tf = dict([])
     self.docs = 0.0
     self.mass = 0.0
     self.dmass = deft(float)
     self.word_by_id = dict([])
     self.id_by_word = dict([])
     self.max_df = max_df
Example #12
 def __count(self, X, features_to_deduct):
     F = Counter()
     I = deft(list)
     for i, x in enumerate(X):
         all_features = set(x)
         new_features = all_features - features_to_deduct
         for f in new_features:
             F[f] += 1
             I[f].append(i)
     return F, I
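The same count-and-invert pattern in standalone form; the function name and toy data below are illustrative, not part of the original class:

from collections import Counter, defaultdict as deft

def count_features(X, features_to_deduct):
    # Document frequency per remaining feature, plus an inverted list of row positions.
    F, I = Counter(), deft(list)
    for i, x in enumerate(X):
        for f in set(x) - features_to_deduct:
            F[f] += 1
            I[f].append(i)
    return F, I

F, I = count_features([['a', 'b'], ['b', 'c']], {'c'})
# F == Counter({'b': 2, 'a': 1}); I['b'] == [0, 1]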
Example #14
 def __init__(self):
     self.sentiment = deft(float)
     with open(SENTIWORDNET, 'rb') as rd:
         for l in rd:
             if not RECORD.match(l):
                 continue
             row = l.decode('utf-8').strip().split('\t')
             w = row[4].split('#')[0]
             pos = float(row[2])
             neg = float(row[3])
             self.sentiment[w] = pos - neg
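Illustrative lookups, assuming the loader above is instantiated as lex and SENTIWORDNET points at a local SentiWordNet 3.0 dump; because the loop overwrites earlier scores, the last row listed for a lemma wins:

# lex.sentiment['good']     # PosScore - NegScore of the last matching row
# lex.sentiment['qwerty']   # 0.0 -- the defaultdict(float) default for unseen words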
 def __init__(
     self,
     streamer,
     window=5,
     n=1000,
     prune_at=5000
 ):
     self.streamer = streamer
     self.window = window
     self.n = n
     self.prune_at = prune_at
     self.words = TermIndex('')
     self.features = TermIndex('')
     self.mass = 0.0
     self.cc_by_w = deft(list)
     self.qq_by_w = dict([])
     self.post_mass = deft(float)
     self.priors = Counter()
     self.ww_by_f = deft(set)
     self.ff_by_w = deft(set)
     self.types = deft(str)
    def update(self, id_concept, vector):

        if len(vector) < 20:
            return

        if not self.seen[id_concept]:
            self.vectors[id_concept] = deft(bool)
            self.seen[id_concept] = True
        _vector = self.vectors[id_concept]
        
        for word, freq in vector:
            _id = self.words(word)
            _vector[_id] += freq
 def __call__(self, bow):
     hits = Counter()
     matched = deft(list)
     for w in set(bow):
         for i in self.aa_by_w[w]:
             if not self.invprob:
                 hits[i] += 1
             else:
                 hits[i] += self.invprob[w]
             matched[i].append(w)
     return [
         (f, i, self.labels[i], matched[i])
         for i, f in hits.most_common(10)
         if f
     ]
    def __group_records(self, records):
        answers = deft(list)
        counts = Counter()
        for fields in records:
            score = int(fields[-1])
            topic = fields[3]
            question = fields[1]
            answer = fields[-2]

            if counts[question] == 2:
                continue

            if score:
                answers[question].append(answer)
                counts[question] += 1

        return answers
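Illustrative input and output for the grouping above; the record layout (question at index 1, answer second-to-last, score last) is read off the field accesses:

# records = [
#     ('id1', 'Q1', 'u', 'topic', 'A1', '1'),
#     ('id2', 'Q1', 'u', 'topic', 'A2', '0'),
#     ('id3', 'Q1', 'u', 'topic', 'A3', '1'),
# ]
# self.__group_records(records) -> answers with {'Q1': ['A1', 'A3']}
# (only positively scored answers are kept, and at most two per question)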
Example #19
 def __init__(self,
              min_size=1,
              max_size=1,
              max_freq=0.5,
              min_freq=2,
              sorted=True,
              format='yx',
              itemtext=False):
     self.min_size = min_size
     self.max_size = max_size
     self.min_freq = min_freq
     self.max_freq = max_freq
     self.sorted = sorted
     self.format = format
     self.itemtext = itemtext
     self.cross_cluster_penalty = dict([])
     self.is_feature = deft(bool)
Example #20
 def __sort(self):
     out = []
     scores = deft(set)
     for key, value in self.assignments.items():
         scores[value[0]].add(key)
         feature_set = tuple(
             [self.ti[feature] for feature in cat_dedup(tuple(value))]
         )
         triple = (feature_set, key, value)
         out.append(triple)
     if not scores.values():
         upperbound = 0
     else:
         upperbound = max([len(x) for x in scores.values()])
     out.sort(key=lambda x: (upperbound - len(scores[x[-1][0]]), x[0]))
     remainder = self.__get_remainder(set(self.assignments.keys()))
     return [(len(scores[z[0]]), x, self.bowid_by_docid[y]) for x, y, z in out] +\
             remainder
Example #22
 def __update_cross_cluster_penalties(self, clusters):
     if self.cross_cluster_penalty.keys() \
     and random.random() >= 0.1:
         return
     n = 100
     V = set([])
     penalties = deft(set)
     if len(clusters) > n:
         _clusters = random.sample(clusters, n)
     else:
         _clusters = clusters
         n = len(clusters)
     for i, (_, _, X) in enumerate(_clusters):
         for x in X:
             for w in x:
                 penalties[w].add(i)
     self.cross_cluster_penalty = {
         feat: len(feat_clusters) / float(n)   # float() keeps this a ratio under Python 2 as well
         for feat, feat_clusters in penalties.items()
     }
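A rough sense of the resulting penalties, assuming each cluster is a (label, id, list-of-bags) triple as unpacked above:

# With n = 2 sampled clusters, a feature occurring in both gets
# cross_cluster_penalty[feat] == 2 / 2.0 == 1.0 (maximally cross-cluster),
# while a feature confined to a single cluster gets 0.5.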
Example #23
 def __to_out(self, rX, X, clusters):
     positions_by_element = deft(set)
     for i, x in enumerate(X):
         positions_by_element[tuple(x)].add(i)
     original_by_position = {i: x for i, x in enumerate(rX)}
     out = self.__flatten_and_add_initial_positions(clusters,
                                                    positions_by_element,
                                                    original_by_position)
     if not self.sorted:
         out.sort(key=lambda x: x[1])
     if self.format == 'x':
         return [e for _, _, e in out]
     elif self.format == 'y':
         return [h for h, _, _ in out]
     elif self.format == 'xy':
         return [(e, h) for h, _, e in out]
     elif self.format == 'yx':
         return [(h, e) for h, _, e in out]
     else:
         exit('FATAL: unrecognized argument for \'format\' parameter.')
    def __init__(
        self,
        dimensions=100,
        window=5,
        min_count=1,
        workers=1,
#         workers=cpu_count(),
        max_vocab_size=20000000,
        iter=5,
        sg=0
    ):
        self.dimensions = dimensions
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.max_vocab_size = max_vocab_size
        self.iter = iter
        self.sg = sg
        self.model = None
        self.observed = deft(bool)
    def extract(self, merge=True):
        nn = sorted([n for n, skip in self.ngrams if not skip], reverse=True)
        snn = sorted([n for n, skip in self.ngrams if skip], reverse=True)
        covered = deft(bool)
        accepted = []
        for n in [5, 4, 3, 2]:
            if n <= 1:
                break
            for gids, freq in tqdm(self.posteriors[(n, False)].items()):

                if freq < self.min_f:
                    continue

                if not self.__wrapping_filter(gids):
                    continue

                pxy = freq / self.masses[n]
                fx = self.posteriors[(n - 1, False)][gids[:n - 1]]
                fy = self.posteriors[(n - 1, False)][gids[-(n - 1):]]
                px = fx / self.masses[n - 1]
                py = fy / self.masses[n - 1]
                px_y = px * py
                
                gxy = [self.words[i] for i in gids]
                gx = [self.words[i] for i in gids[:n - 1]]
                gy = [self.words[i] for i in gids[-(n - 1):]]
                
                if covered[gids]:
                    continue

                if (pxy >= px * self.ratio_within and pxy >= py * self.ratio_within):
                    _n = n - 1
                    self.rewrites[tuple(gxy)] = tuple(gxy)
                    while _n:
                        _gids = gids[:_n]
                        gids_ = gids[_n:]
                        covered[_gids] = True
                        covered[gids_] = True
                        _n -= 1

        self.__inside_extract()
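Toy numbers for the within-ratio test above; all counts and masses are illustrative:

# Suppose the trigram 'new york city' occurs 40 times with trigram mass 10000,
# while 'new york' occurs 50 and 'york city' 45 times with bigram mass 20000:
#   pxy = 40 / 10000.0 = 0.004
#   px  = 50 / 20000.0 = 0.0025,  py = 45 / 20000.0 = 0.00225
# With ratio_within = 0.8, pxy >= 0.8 * px and pxy >= 0.8 * py, so the trigram
# is kept and its shorter prefixes and suffixes (lengths 1..n-1) are marked as covered.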
Example #28
    def __init__(self, freqDist):
        self.bands = deft(set)
        for word, freq in freqDist.items():
            self.bands[freq].add(word)
        for i, band in enumerate(sorted(self.bands.keys(), reverse=True)):
            l = len(self.bands[band])
            # print i, band, len(self.bands[band])
            # metric 1
            # if i >= band:
            #     self.k = i
            #     self.f = band
            #     # break
            # metric 2
            # if len(self.bands[band]) >= band:
            #     self.k = i
            #     self.f = band
            #     break
            # metric 3: squared frequency depth
            if l >= band:
                # self.k = l
                self.f = i
                break
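Toy illustration of metric 3 above; the frequency distribution is made up:

# freqDist = {'the': 100, 'of': 100, 'cat': 3, 'dog': 3, 'mat': 3, 'sat': 3}
# bands: 100 -> {'the', 'of'} (2 words), 3 -> {'cat', 'dog', 'mat', 'sat'} (4 words)
# i=0: band=100, l=2 -> 2 < 100, keep scanning
# i=1: band=3,   l=4 -> 4 >= 3, so self.f = 1 and the scan stops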
 def __init__(self, categories_by_concept, terms,
              categories, tfidf, max_depth=5, min_df=20
 ):
     self.min_df = min_df
     self.path_categories_by_concept = categories_by_concept
     self.path_categories = categories
     self.path_terms = terms
     self.max_depth = max_depth
     self.observed_category = deft(bool)
     self.id_by_concept = dict([])
     self.concept_by_id = dict([])
     self.term_is_category = deft(bool)
     self.parents_by_category = dict([])
     self.parents_by_concept = deft(list)
     self.id_by_term = dict([])
     self.term_by_id = dict([])
     self.has_parents = deft(bool)
     self.tfidf = tfidf
     self.pulling = set([])
     self.vector_by_category = deft(Counter)
     self.contributors_by_category = deft(set)
     self.projected = Counter()
            for w, f in freqs.items():
                vector[w] += f
        return {w: f for w, f in vector.items() if f > 2}
    
#     def __remove_header(self, section):
#         return section


if __name__ == '__main__':
    
    streamer = Streamer()
    
    dist = Distribution('vector')
    category_index = TermIndex('category')
    variant_index = TermIndex('term')
    variants_by_concept = deft(list)
    categories_by_concept = deft(list)

    start = time.time()
    for i, article in enumerate(streamer.articles()):

#         if i <= 380000:
#             if not i % 1000:
#                 print '...', i
#             continue

        concept = article['concept']
        variant = article['variant']
        categories = article['categories']

        id_concept = variant_index(concept)
 def __init__(self, invprob):
     self.aa_by_w = deft(list)
     self.labels = []
     self.n = 0
     self.invprob = invprob
Example #32
import nltk

from nltk.corpus import wordnet as wn

from collections import defaultdict as deft


EXCLUDED = deft(bool)
for w in [
    'entity.n.01', 'abstraction.n.06', 'physical_entity.n.01',
    'psychological_feature.n.01', 'event.n.01', 'abstraction.n.06',
    'state.n.02', 'communication.n.02', 'act.n.02', 'whole.n.02',
    'content.n.05', 'message.n.02', 'group.n.01', 'quality.n.01',
    'attribute.n.02', 'relation.n.01', 'cognition.n.01', 'instrumentality.n.03',
    'object.n.01', 'location.n.01', 'fundamental_quantity.n.01', 'measure.n.02',
    'artifact.n.01', 'cognition.n.01', 'property.n.02', 'relation.n.01',
    'vertebrate.n.01', 'medium_of_exchange.n.01', 'legality.n.01', 'change.n.03',
    'fraction.n.03', 'process.n.06', 'action.n.01', 'definite_quantity.n.01',
    'chordate.n.01', 'representational_process.n.01', 'sidereal_day.n.01',
    'ideal.n.01', 'repeat.n.01', 'one.n.01', 'grammatical_category.n.01',
    'part.n.03', 'psychological_state.n.01', 'assignment.n.01',
    'out.n.01', 'kind.n.01', 'type.n.06'
]:
    EXCLUDED[w] = True
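A sketch of how such an exclusion table is typically consumed when walking hypernym paths; the 'dog' query below is illustrative:

paths = wn.synsets('dog')[0].hypernym_paths()
informative = [
    s for path in paths for s in path
    if not EXCLUDED[s.name()]   # drops near-top synsets such as entity.n.01
]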



class WordNet:
    
    def __init__(self):
 def __init__(self, name):
     self.name = name
     self.words = TermIndex('vector.term')
     self.vectors = dict([])
     self.seen = deft(bool)
        elif not alnum:
            add_token(tokens, token, alnum)
            token = ''
        else:
            add_token(tokens, token, alnum)
            token = char
    if token:
        add_token(tokens, token, alnum)
    return tokens


def normalize(string):
    return unicode2ascii(string.strip().lower())


stopwords = deft(bool)
for w in STOPWORDS['en']:
    stopwords[w] = True


def words(text):
    tokens = [w for w in tokenizer(text.lower()) if w != 'PUNCT']
#     return tokens
    return [w for w in tokens if not stopwords[w]]


def f1(prec, recall):
    # e.g. f1(0.5, 1.0) == (2 * 0.5 * 1.0) / 1.5 ~= 0.667
    if not (prec + recall):   # avoid ZeroDivisionError when both are 0
        return 0.0
    return (2 * prec * recall) / (prec + recall)


def average(values):
    return sum(values) / float(len(values))
Example #35
import re

from collections import defaultdict as deft

from nltk import (
    ngrams,
    sent_tokenize as splitter,
    wordpunct_tokenize as tokenizer
)

from nltk.corpus import stopwords

from nltk.probability import FreqDist


NUM = re.compile('[0-9]')
NON_ALPHA = re.compile('[^a-z]', re.IGNORECASE)

STOPWORDS = deft(bool)
for w in stopwords.words('english'):
    STOPWORDS[w] = True


def decode(string):
    try:
        return string.decode('utf-8')
    except Exception:
        return string


def encode(string):
    try:
        return string.encode('utf-8')
    except Exception: