Example #1
def __init__(self, data_io=(), eager=False):
    self.corpus_parser = CorpusParser()
    self.data_io = data_io
    self.trained = False
    if eager:
        self.train()
        self.trained = True

def test_blank(self):
    """blank lines yield no parse results"""
    cp = CorpusParser()

    results = list(cp.parse(self.blank))
    self.assertEqual(0, len(results))
Example #3
def __init__(self, data_io=(), eager=False):
  self.corpus_parser = CorpusParser()
  self.data_io = data_io
  self.trained = False
  if eager:
    self.train()
    self.trained = True

def test_parse(self):
  """will parse a Brown corpus line using the standard / notation"""
  cp = CorpusParser()

  null = CorpusParser.TagWord('START', 'START')
  several = CorpusParser.TagWord('Several', 'ap')
  defendants = CorpusParser.TagWord('defendants', 'nns')
  period = CorpusParser.TagWord('.', '.')

  expectations = [
    [null, several],
    [several, defendants],
    [defendants, period]
  ]

  results = list(cp.parse(self.stream))
  self.assertListEqual(expectations, results)
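The two tests above pin down the parser's contract: parse() yields consecutive [previous, current] TagWord bigrams seeded with a START marker, and blank input yields nothing. Below is a minimal sketch consistent with those tests; it is an illustration, not the project's actual CorpusParser, and it assumes the input is an iterable of lines.

from collections import namedtuple


class CorpusParser(object):
  TagWord = namedtuple('TagWord', ['word', 'tag'])

  def parse(self, stream):
    # Seed with START so the first real token pairs with it.
    previous = self.TagWord('START', 'START')
    for line in stream:
      for token in line.split():
        # Brown corpus notation is word/tag; rpartition keeps words
        # containing '/' intact and handles './.' correctly.
        word, _, tag = token.rpartition('/')
        current = self.TagWord(word, tag)
        yield [previous, current]
        previous = current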
from collections import defaultdict

import numpy as np
import pandas as pd


def splitter(sources):
    res_train, res_test = defaultdict(dict), defaultdict(dict)
    for source in sources:
        # Parse corpus
        parser = CorpusParser(input_dir=source, ldir="./log/")
        parser.parse()
        # Get and transform to DataFrame
        reviews = parser.get_parsed()
        reviews = pd.DataFrame(reviews)
        # Split train/test
        np.random.seed(42)
        msk = np.random.rand(len(reviews)) < 0.8
        train, test = reviews[msk], reviews[~msk]
        # Retransform to dict
        train, test = train.T.to_dict().values(), test.T.to_dict().values()
        # Add to result
        res_train[source] = train
        res_test[source] = test
    return res_train, res_test
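A hypothetical call to splitter(); the source directory names are placeholders, not paths from the project.

train_data, test_data = splitter(['./corpora/books/', './corpora/movies/'])
for source, reviews in train_data.items():
    print(source, len(reviews), 'training reviews')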
    def test_parse(self):
        """
        will parse a Brown corpus line using the standard / notation
        """
        cp = CorpusParser()

        null = CorpusParser.TagWord('START', 'START')
        several = CorpusParser.TagWord('Several', 'ap')
        defendants = CorpusParser.TagWord('defendants', 'nns')
        period = CorpusParser.TagWord('.', '.')

        expectations = [[null, several], [several, defendants],
                        [defendants, period]]

        results = list(cp.parse(self.stream))
        self.assertListEqual(expectations, results)
Example #7
import re
from collections import defaultdict

from corpus_parser import CorpusParser  # project-local module, as in the other examples


class POSTagger(object):
    """
  This class is responsible for tagging new data given the corpus training data.
  此類別針對已知語料訓練資料來標記新資料
  """
    class LazyFile(object):
        """
    The class wrapping an iterator of a file object in an iterator, which opens the file when iterated.
    此類別在迭代器中包裝文件對象的迭代器 , 在迭代時打開該文件
    """
        def __init__(self, filename):
            self.filename = filename
            self.file = None

        def __iter__(self):
            self.file = open(self.filename, 'r')
            return self

        def __next__(self):
            try:
                line = next(self.file)
            except StopIteration as e:
                self.file.close()
                raise e
            return line

        def next(self):
            return self.__next__()

    @classmethod
    def from_filepaths(cls, training_files, eager=False):
        """
    Create POSTagger from list of file names
    從文件名列表創建 POSTagger
    :param training_files: list of file names
    :param eager: boolean: train while opening
    :return: POStagger
    """
        lazy_files = [POSTagger.LazyFile(fn) for fn in training_files]
        return POSTagger(lazy_files, eager)

    def __init__(self, data_io=(), eager=False):
        self.corpus_parser = CorpusParser()
        self.data_io = data_io
        self.trained = False
        if eager:
            self.train()
            self.trained = True

    def train(self):
        if not self.trained:
            self.tags = set()
            self.tag_combos = defaultdict(int)
            self.tag_frequencies = defaultdict(int)
            self.word_tag_combos = defaultdict(int)

            for io in self.data_io:
                for line in io:
                    for ngram in self.corpus_parser.parse(line):
                        self.write(ngram)
            self.trained = True

    def write(self, ngram):
        """
    :param ngram:
    """
        if ngram[0].tag == 'START':
            self.tag_frequencies['START'] += 1
            self.word_tag_combos['START/START'] += 1

        self.tags.add(ngram[-1].tag)

        self.tag_frequencies[ngram[-1].tag] += 1
        combo = ngram[-1].word + '/' + ngram[-1].tag
        self.word_tag_combos[combo] += 1
        combo = ngram[0].tag + '/' + ngram[-1].tag
        self.tag_combos[combo] += 1

    def viterbi(self, sentence):
        sentence1 = re.sub(r'([\.\?!])', r' \1', sentence)
        parts = re.split(r'\s+', sentence1)
        last_viterbi = {}
        backpointers = ['START']

        for tag in self.tags:
            if tag == 'START':
                continue
            else:
                probability = self.tag_probability('START', tag) \
                              * self.word_tag_probability(parts[0], tag)

                if probability > 0:
                    last_viterbi[tag] = probability

        if len(last_viterbi) > 0:
            backpointer = max(last_viterbi,
                              key=(lambda key: last_viterbi[key]))
        else:
            backpointer = max(self.tag_frequencies,
                              key=(lambda key: self.tag_frequencies[key]))
        backpointers.append(backpointer)

        for part in parts[1:]:
            viterbi = {}
            for tag in self.tags:
                if tag == 'START':
                    continue
                if len(last_viterbi) == 0:
                    break

                best_tag = max(last_viterbi,
                               key=(lambda prev_tag: last_viterbi[prev_tag] *
                                    self.tag_probability(prev_tag, tag) *
                                    self.word_tag_probability(part, tag)))

                probability = last_viterbi[best_tag] * \
                              self.tag_probability(best_tag, tag) * \
                              self.word_tag_probability(part, tag)

                if probability > 0:
                    viterbi[tag] = probability

            last_viterbi = viterbi

            if len(last_viterbi) > 0:
                backpointer = max(last_viterbi,
                                  key=(lambda key: last_viterbi[key]))
            else:
                backpointer = max(self.tag_frequencies,
                                  key=(lambda key: self.tag_frequencies[key]))
            backpointers.append(backpointer)

        return backpointers

    def tag_probability(self, previous_tag, current_tag):
        """Maximum likelihood estimate 最大概率估算
    count(previous_tag, current_tag) / count(previous_tag)"""
        denom = self.tag_frequencies[previous_tag]

        if denom == 0:
            return 0
        else:
            return self.tag_combos[previous_tag + '/' +
                                   current_tag] / float(denom)

    def word_tag_probability(self, word, tag):
        """Maximum Likelihood estimate 最大概率估算
    count (word and tag) / count(tag)"""
        denom = self.tag_frequencies[tag]
        if denom == 0:
            return 0
        else:
            return self.word_tag_combos[word + '/' + tag] / float(denom)

    def probability_of_word_tag(self, words, tags):
        if len(words) != len(tags):
            raise ValueError('The word and tags must be the same length!')

        length = len(words)

        probability = 1.0

        for i in range(1, length):
            probability *= self.tag_probability(tags[i - 1], tags[i]) * \
                           self.word_tag_probability(words[i], tags[i])

        return probability
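A hedged usage sketch for the tagger above; 'brown_corpus.txt' is a placeholder for a Brown-format training file, and the printed tags are illustrative only.

tagger = POSTagger.from_filepaths(['brown_corpus.txt'], eager=True)
print(tagger.viterbi('Several defendants settled .'))
# e.g. ['START', 'ap', 'nns', 'vbd', '.'] (actual tags depend on the corpus)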
import sys
sys.path.append('./src/')
from corpus_parser import CorpusParser
from corpus_attributes import CorpusAttributes

# Pick the corpus to parse
corpus_parser = CorpusParser(sys.argv)

# Parse it
corpus_parser.parse()

# Create the attributes object
corpus_attributes = CorpusAttributes(corpus_parser)

# Get aggregate statistics on attributes over time
corpus_attributes.get_attributes()
corpus_attributes.get_stats(sys.argv[3])
corpus_attributes.print_stats()
  def test_blank(self):
    """does not allow blank lines from happening"""
    cp = CorpusParser()

    results = list(cp.parse(self.blank))
    self.assertEqual(0, len(results))
import sys
sys.path.append('./src/')
from corpus_parser import CorpusParser
from naive_sentiment import NaiveSentiment

# Pick the corpus to parse
corpus_parser = CorpusParser(sys.argv)

# Parse it 
corpus_parser.parse()

# Get stats for the particular corpus
naive_sentiment = NaiveSentiment(corpus_parser)

# Parse articles for content words and print their frequencies

naive_sentiment.get_stats()

naive_sentiment.print_stats()


Example #11
from os import listdir
from os.path import isfile, join, splitext
from bs4 import BeautifulSoup
import pickle

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.clustering import LDA

from corpus_parser import CorpusParser  # project-local module, as in the other examples

numTopWords = 500
numTopTopicWords = 20
numTopics = 4
numIterations = 10

# pages directory expected to be in cwd
pagesPath = "./pages/"
pagesFileNames = [f for f in listdir(pagesPath) if isfile(join(pagesPath, f))]
filePaths = [join(pagesPath, fileName) for fileName in pagesFileNames]

# generate spark compatible LDA dataset
parser = CorpusParser(filePaths)
filename = "lda_dataset.txt"
parser.generateLDADataset(filename, numTopWords)

# load spark
sc = SparkContext("local", "LDA")
spark = SparkSession.builder.appName("LDA").getOrCreate()
sc.addFile(filename)
dataset = spark.read.format("libsvm").load(filename)

# train LDA model
lda = LDA()
lda.setK(numTopics).setMaxIter(numIterations)
model = lda.fit(dataset)

ll = model.logLikelihood(dataset)
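The script defines numTopTopicWords but the snippet stops at the log-likelihood; the trained model's topics can be inspected with Spark's standard describeTopics API. A short continuation sketch:

print('log-likelihood:', ll)
topics = model.describeTopics(numTopTopicWords)
topics.show(truncate=False)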
Example #12
import re
from collections import defaultdict

from corpus_parser import CorpusParser  # project-local module, as in the other examples


class POSTagger(object):
  """
  This class is responsible for tagging new data given the corpus training data.
  """

  class LazyFile(object):
    """
    Wraps a file in an iterator that opens the file only when iteration begins.
    """
    def __init__(self, filename):
      self.filename = filename
      self.file = None

    def __iter__(self):
      self.file = open(self.filename, 'r')
      return self

    def __next__(self):
      try:
        line = next(self.file)
      except StopIteration as e:
        self.file.close()
        raise e
      return line

    def next(self):
      return self.__next__()

  @classmethod
  def from_filepaths(cls, training_files, eager=False):
    """
    Create a POSTagger from a list of file names.

    :param training_files: list of file names
    :param eager: if True, train immediately on construction
    :return: POSTagger
    """
    lazy_files = [POSTagger.LazyFile(fn) for fn in training_files]
    return POSTagger(lazy_files, eager)

  def __init__(self, data_io=(), eager=False):
    self.corpus_parser = CorpusParser()
    self.data_io = data_io
    self.trained = False
    if eager:
      self.train()
      self.trained = True

  def train(self):
    if not self.trained:
      self.tags = set()
      self.tag_combos = defaultdict(int)
      self.tag_frequencies = defaultdict(int)
      self.word_tag_combos = defaultdict(int)

      for io in self.data_io:
        for line in io:
          for ngram in self.corpus_parser.parse(line):
            self.write(ngram)
      self.trained = True

  def write(self, ngram):
    """

    :param ngram:
    """
    if ngram[0].tag == 'START':
      self.tag_frequencies['START'] += 1
      self.word_tag_combos['START/START'] += 1

    self.tags.add(ngram[-1].tag)

    self.tag_frequencies[ngram[-1].tag] += 1
    combo = ngram[-1].word + '/' + ngram[-1].tag
    self.word_tag_combos[combo] += 1
    combo = ngram[0].tag + '/' + ngram[-1].tag
    self.tag_combos[combo] += 1

  def viterbi(self, sentence):
    sentence1 = re.sub(r'([\.\?!])', r' \1', sentence)
    parts = re.split(r'\s+', sentence1)
    last_viterbi = {}
    backpointers = ['START']

    for tag in self.tags:
      if tag == 'START':
        continue
      else:
        probability = self.tag_probability('START', tag) \
                      * self.word_tag_probability(parts[0], tag)

        if probability > 0:
          last_viterbi[tag] = probability

    if len(last_viterbi) > 0:
      backpointer = max(last_viterbi,
                        key=(lambda key: last_viterbi[key]))
    else:
      backpointer = max(self.tag_frequencies,
                        key=(lambda key: self.tag_frequencies[key]))
    backpointers.append(backpointer)

    for part in parts[1:]:
      viterbi = {}
      for tag in self.tags:
        if tag == 'START':
          continue
        if len(last_viterbi) == 0:
          break

        best_tag = max(last_viterbi,
                       key=(lambda prev_tag: last_viterbi[prev_tag] *
                                             self.tag_probability(prev_tag, tag) *
                                             self.word_tag_probability(part, tag)))

        probability = last_viterbi[best_tag] * \
                      self.tag_probability(best_tag, tag) * \
                      self.word_tag_probability(part, tag)

        if probability > 0:
          viterbi[tag] = probability

      last_viterbi = viterbi

      if len(last_viterbi) > 0:
        backpointer = max(last_viterbi,
                          key=(lambda key: last_viterbi[key]))
      else:
        backpointer = max(self.tag_frequencies,
                          key=(lambda key: self.tag_frequencies[key]))
      backpointers.append(backpointer)

    return backpointers

  def tag_probability(self, previous_tag, current_tag):
    """Maximum likelihood estimate
    count(previous_tag, current_tag) / count(previous_tag)"""
    denom = self.tag_frequencies[previous_tag]

    if denom == 0:
      return 0
    else:
      return self.tag_combos[previous_tag + '/' + current_tag] / float(denom)

  def word_tag_probability(self, word, tag):
    """Maximum Likelihood estimate
    count (word and tag) / count(tag)"""
    denom = self.tag_frequencies[tag]
    if denom == 0:
      return 0
    else:
      return self.word_tag_combos[word + '/' + tag] / float(denom)

  def probability_of_word_tag(self, words, tags):
    if len(words) != len(tags):
      raise ValueError('The word and tags must be the same length!')

    length = len(words)

    probability = 1.0

    for i in range(1, length):
      probability *= self.tag_probability(tags[i - 1], tags[i]) * \
                     self.word_tag_probability(words[i], tags[i])

    return probability
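To make the MLE formulas in the docstrings concrete, here is a tiny worked example with hand-populated counts; the numbers are illustrative only, not from any real corpus.

tagger = POSTagger()
tagger.train()  # empty data_io, so this just initializes the count tables
tagger.tag_frequencies['ap'] = 2
tagger.tag_frequencies['nns'] = 4
tagger.tag_combos['ap/nns'] = 1
tagger.word_tag_combos['defendants/nns'] = 1

print(tagger.tag_probability('ap', 'nns'))               # 1 / 2 = 0.5
print(tagger.word_tag_probability('defendants', 'nns'))  # 1 / 4 = 0.25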
Example #13
def __init__(self, data_io):
    self.corpus_parser = CorpusParser()
    self.data_io = data_io
    self.trained = False
Example #14
import re
from collections import defaultdict

from corpus_parser import CorpusParser  # project-local module, as in the other examples


class POSTagger:
    def __init__(self, data_io):
        self.corpus_parser = CorpusParser()
        self.data_io = data_io
        self.trained = False

    def train(self):
        if not self.trained:
            self.tags = set(['START'])
            self.tag_combos = defaultdict(int)
            self.tag_frequencies = defaultdict(int)
            self.word_tag_combos = defaultdict(int)

            for io in self.data_io:
                for line in io:
                    for ngram in self.corpus_parser.parse(line):
                        self.write(ngram)

            self.trained = True

    def write(self, ngram):
        if ngram[0].tag == 'START':
            self.tag_frequencies['START'] += 1
            self.word_tag_combos['START/START'] += 1

        # self.tags is a set of tag strings, so add the tag itself
        self.tags.add(ngram[-1].tag)
        self.tag_frequencies[ngram[-1].tag] += 1
        self.word_tag_combos['/'.join([ngram[-1].word, ngram[-1].tag])] += 1
        self.tag_combos['/'.join([ngram[0].tag, ngram[-1].tag])] += 1

    def tag_probability(self, previous_tag, current_tag):
        """count(previous_tag, current_tag) / count(previous_tag)"""
        denom = self.tag_frequencies[previous_tag]

        if denom == 0:
            return 0.0
        return self.tag_combos['/'.join([previous_tag, current_tag])] / float(denom)

    def word_tag_probability(self, word, tag):
        """count(word, tag) / count(tag)"""
        denom = self.tag_frequencies[tag]

        if denom == 0:
            return 0.0
        return self.word_tag_combos['/'.join([word, tag])] / float(denom)

    def probability_of_word_tag(self, word_sequence, tag_sequence):
        if len(word_sequence) != len(tag_sequence):
            raise ValueError('The words and tags must be the same length!')

        length = len(word_sequence)
        probability = 1.0

        for i in range(1, length):
            probability *= (
                self.tag_probability(tag_sequence[i - 1], tag_sequence[i]) *
                self.word_tag_probability(word_sequence[i], tag_sequence[i])
            )

        return probability

    def viterbi(self, sentence):
        # Split off sentence-final punctuation, then tokenize on whitespace
        sentence = re.sub(r'([\.\?!])', r' \1', sentence)
        parts = re.split(r'\s+', sentence)

        last_viterbi = {}
        backpointers = ['START']

        for tag in self.tags:
            if tag == 'START':
                continue
            probability = self.tag_probability('START', tag) * \
                          self.word_tag_probability(parts[0], tag)

            if probability > 0:
                last_viterbi[tag] = probability

        # Back off to the globally most frequent tag if nothing matched
        if last_viterbi:
            backpointer = max(last_viterbi, key=last_viterbi.get)
        else:
            backpointer = max(self.tag_frequencies, key=self.tag_frequencies.get)
        backpointers.append(backpointer)

        for part in parts[1:]:
            viterbi = {}
            for tag in self.tags:
                if tag == 'START':
                    continue
                if not last_viterbi:
                    break

                best_tag = max(
                    last_viterbi,
                    key=lambda prev_tag: last_viterbi[prev_tag] *
                                         self.tag_probability(prev_tag, tag) *
                                         self.word_tag_probability(part, tag))

                probability = last_viterbi[best_tag] * \
                              self.tag_probability(best_tag, tag) * \
                              self.word_tag_probability(part, tag)

                if probability > 0:
                    viterbi[tag] = probability

            last_viterbi = viterbi

            if last_viterbi:
                backpointer = max(last_viterbi, key=last_viterbi.get)
            else:
                backpointer = max(self.tag_frequencies, key=self.tag_frequencies.get)
            backpointers.append(backpointer)

        return backpointers