Example #1
    def load_lexicon(self, filename=None, stemmed=False):
        if filename is None: filename = self.filename

        self._scores = {}

        re_hashtag = re.compile(r'\#\d+')  # matches '#<sense>' suffixes, e.g. 'abandon#1'

        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                if '|' in line: prefix, _ = line.split(u'|', 1)
                else: prefix = line.strip()
                cols = prefix.strip().split()
                if not cols: continue

                categories = {}
                for i, tok in enumerate(cols):
                    # Column 0 is the word (any '#<sense>' suffix stripped);
                    # column 1 is skipped; the rest are category labels.
                    if i == 0: w = re_hashtag.sub(u'', cols[0]).lower()
                    elif i == 1: continue
                    else: categories[tok.lower().strip('*')] = 1.0

                if stemmed: w = porter_stem(w)
                if w not in self._scores: self._scores[w] = categories
                else: self._scores[w].update(categories)

        return u'Read {} items from {} (<{}>).'.format(
            len(self._scores), self.__class__.__name__,
            os.path.relpath(filename, _parentdir))
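
Every snippet below calls a shared, module-level porter_stem helper that the examples themselves never show. One plausible definition, assuming NLTK is available:

from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def porter_stem(word):
    # Stem each whitespace-separated token so that multi-word entries
    # (e.g. the underscore-joined phrases in Example #8) stem cleanly.
    return u' '.join(_stemmer.stem(tok) for tok in word.split())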
Example #2
    def _analyze(self, word):
        stem = porter_stem(word)
        # Take the first analysis of the first token and keep only the
        # lemma: the part before the '||' and '<' markers.
        lemma = list(self.analyzer.analyze(
            [[word]]))[0][0][1].split('||')[0].split('<')[0]

        # All candidate analyses for the word (Python 2 generator protocol).
        cand_krs = self.morph_analyzer.analyze([[word]]).next().next()
        candidates = [cand.split('||')[0].split('<')[0] for cand in cand_krs]

        self.cache[word] = (stem, lemma, candidates)
Example #3
def remove_stems(text, output_file):
  dir = os.getcwd()
  dictionary_dir = os.path.join(dir, 'dictionaries/')
  md = defaultdict(list)
  # Group surface forms by their Porter stem.
  for word in text:
    word = word.strip()
    md[porter_stem(word)].append(word)
  # Keep one surface form per stem; print any groups that collapsed.
  with open(os.path.join(dictionary_dir, output_file), 'wb') as output:
    for k, v in md.iteritems():
      output.write(v[0] + "\n")
      if len(v) > 1:
        print v
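
A hypothetical invocation (the word list and output filename are invented for illustration; a dictionaries/ directory must already exist under the working directory):

words = ['connect', 'connected', 'connecting', 'network']
remove_stems(words, 'technology-unique')
# Writes 'connect' and 'network' to dictionaries/technology-unique and
# prints the collapsed group ['connect', 'connected', 'connecting'].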
Example #4
def remove_stems_from_file():
  dir = os.getcwd()
  dictionary_dir = os.path.join(dir, 'dictionaries/')
  md = defaultdict(list)
  # Group surface forms by their Porter stem.
  with open(os.path.join(dictionary_dir, 'temp-extend-technology')) as f:
    for word in f:
      word = word.strip()
      md[porter_stem(word)].append(word)
  # Keep one surface form per stem; print any groups that collapsed.
  with open(os.path.join(dictionary_dir, 'temp-extend-technology-unique'), 'wb') as output:
    for k, v in md.iteritems():
      output.write(v[0] + "\n")
      if len(v) > 1:
        print v
Example #5
    def load_lexicon(self, filenames=None, stemmed=False):
        if filenames is None: filenames = self.filenames

        self._scores = {}

        # The first file holds positive entries (+1.0), the second negative
        # ones (-1.0); both scores are stored under the '+ve' key.
        for (score, filename) in zip((1.0, -1.0), filenames):
            with codecs.open(filename, 'r', 'utf-8') as f:
                for line in f:
                    w = line.strip()
                    if stemmed: w = porter_stem(w)
                    if w not in self._scores: self._scores[w] = {'+ve': score}

        return u'Read {} items from {} (<{}>, <{}>).'.format(
            len(self._scores), self.__class__.__name__,
            os.path.relpath(filenames[0], _parentdir),
            os.path.relpath(filenames[1], _parentdir))
Example #6
    def load_lexicon(self, filename=None, stemmed=False):
        if filename is None: filename = self.filename
        self._embeddings = {}
        self.dimensions = 50

        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                cols = line.rstrip(u'\n').split(u'\t')
                assert len(cols) == self.dimensions + 1
                w = porter_stem(cols[0]) if stemmed else cols[0]
                # Parse the tab-separated values into a float vector.
                self._embeddings[w] = np.array(cols[1:], dtype=float)

        self._unk_embedding = self._embeddings['<unk>']

        return u'Read {} embeddings from {} (<{}>).'.format(
            len(self._embeddings), self.__class__.__name__,
            os.path.relpath(filename, _parentdir))
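
A plausible companion lookup for the structures this method fills in (hypothetical; not part of the original snippet):

    def embedding(self, word, stemmed=False):
        # Out-of-vocabulary words fall back to the '<unk>' vector.
        w = porter_stem(word) if stemmed else word
        return self._embeddings.get(w, self._unk_embedding)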
Example #7
    def load_lexicon(self, filename=None, stemmed=False):
        if filename is None: filename = self.filename

        self._scores = {}

        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                w, c, s = self._parse_line(line)
                s = float(s)
                if s == 0.0: continue
                if stemmed: w = porter_stem(w)

                if w not in self._scores: self._scores[w] = {}
                self._scores[w][c] = s

        return u'Read {} items from {} (<{}>).'.format(
            len(self._scores), self.__class__.__name__,
            os.path.relpath(filename, _parentdir))
Example #8
    def load_lexicon(self, filename=None, stemmed=False):
        if filename is None: filename = self.filename

        self._scores = {}

        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                id_, eff, words, _ = line.strip().split(u'\t', 3)
                if eff == 'Null': continue
                # Keep only the effect's sign, normalised to '+eff' / '-eff'.
                eff = eff[0] + 'eff'

                for w in words.split(u','):
                    w = w.replace(u'_', u' ')
                    if stemmed: w = porter_stem(w)
                    if w not in self._scores: self._scores[w] = {}
                    self._scores[w][eff] = 1.0

        return u'Read {} items from {} (<{}>).'.format(
            len(self._scores), self.__class__.__name__,
            os.path.relpath(filename, _parentdir))
Example #9
  def __init__(self, filename=None, stemmed=False):
    if filename is None: filename = self.filename

    cluster_string_to_id = {}
    clusters_count = 0
    self._clusters = {}

    with codecs.open(filename, 'r', 'utf-8') as f:
      for line in f:
        try:
          # Expect '<cluster label>\t<word>\t<rest>'; skip malformed lines.
          c, w, _ = line.split(u'\t', 2)
          if c not in cluster_string_to_id:
            clusters_count += 1
            cluster_string_to_id[c] = clusters_count

          if stemmed: w = porter_stem(w)
          self._clusters[w] = cluster_string_to_id[c]
        except ValueError: pass

    logger.debug('Read {} words and {} clusters for {} (<{}>).'.format(
        len(self._clusters), clusters_count, self.__class__.__name__,
        os.path.relpath(filename, _parentdir)))
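
Since cluster ids are assigned starting from 1, a later lookup can safely use 0 as an out-of-vocabulary sentinel (hypothetical usage, not shown in the original):

      cluster_id = self._clusters.get(w, 0)  # 0 = no cluster known for w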
Example #10
    def load_lexicon(self, filename=None, stemmed=False):
        if filename is None: filename = self.filename

        self._scores = {}

        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                m = self.re_line.match(line)
                if m is None:
                    raise Exception(
                        'Unable to parse line in MPQASubjectivityLexicon: {}'.
                        format(line.strip()))

                prior = m.group(6)
                if prior == 'neutral': continue

                w = m.group(3)
                if stemmed: w = porter_stem(w)

                strength = m.group(1)
                score = 1.0 if strength == 'strongsubj' else 0.5

                if w not in self._scores: self._scores[w] = {}
                if prior == 'both':
                    self._scores[w]['+ve'] = score
                    self._scores[w]['-ve'] = score
                elif prior == 'positive':
                    self._scores[w]['+ve'] = score
                elif prior == 'negative':
                    self._scores[w]['-ve'] = score
                elif prior == 'weakpos':
                    self._scores[w]['+ve'] = score * 0.5
                elif prior == 'weakneg':
                    self._scores[w]['-ve'] = score * 0.5

        return u'Read {} items from {} (<{}>).'.format(
            len(self._scores), self.__class__.__name__,
            os.path.relpath(filename, _parentdir))
Example #11
    def load_lexicon(self, filename=None, stemmed=False):
        if filename is None: filename = self.filename

        self._scores = {}
        re_sense = re.compile(r'\#\d+$')  # trailing '#<sense number>' on synset terms

        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                if line.startswith('#'): continue
                pos, id_, pos_score, neg_score, synset, _ = line.split(
                    u'\t', 5)
                if not id_: continue
                for w in synset.split():
                    w = re_sense.sub('', w)
                    if stemmed: w = porter_stem(w)
                    self._scores[w] = {
                        'pos': float(pos_score),
                        'neg': float(neg_score)
                    }

        return u'Read {} items from {} (<{}>).'.format(
            len(self._scores), self.__class__.__name__,
            os.path.relpath(filename, _parentdir))
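
Across the load_lexicon variants above, self._scores maps each (optionally stemmed) word to a dict of category scores, so scoring a token reduces to plain dictionary lookups. A hypothetical query against the MPQA-style scores of Example #10 (constructor and access pattern invented for illustration):

lex = MPQASubjectivityLexicon()
lex.load_lexicon(stemmed=True)
scores = lex._scores.get(porter_stem('happily'), {})
positive = scores.get('+ve', 0.0)  # 0.0 when no positive prior is recorded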