Example #1
def tag(text):
    """Tags the input text.

    Arguments:
        text (str): The text to tag.

    Returns:
        ([[(str, str)]]): List of sentences containing lists of word/tag pairs.
    """
    #Separate the input text into sentences
    sentences = nltk.sent_tokenize(str(text))

    #Separate each sentence into words
    nested = []
    for sentence in sentences:
        nested.append(nltk.word_tokenize(sentence))

    # Prepare default tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)  # Same tagger as using nltk.pos_tag

    # Prepare regex tagger for custom tags
    regexp_tagger = nltk.tag.RegexpTagger([(r'\(', '('),
                                           (r'\)', ')'),
                                           (r'\[', '['),
                                           (r'\]', ']'),
                                           (r'_+', 'None')],
                                          backoff=tagger)

    #Add a part of speech tag to each word
    nested_tagged = []
    for sentence in nested:
        nested_tagged.append([TaggedToken(*x) for x in regexp_tagger.tag(sentence)])

    return nested_tagged
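This example depends on the old 'taggers/maxent_treebank_pos_tagger/english.pickle' resource (no longer the default tagger in newer NLTK releases) and on an external TaggedToken class. A minimal sketch of the same sentence-split-then-tag pipeline using NLTK's current convenience functions, assuming the punkt and averaged_perceptron_tagger resources are downloaded:

import nltk

def tag_simple(text):
    # Same idea as above, but with the current default tagger and no
    # custom regexp backoff for brackets and underscores.
    return [nltk.pos_tag(nltk.word_tokenize(sentence))
            for sentence in nltk.sent_tokenize(str(text))]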
Example #2
 def __init__(self):
     # Initializing TreeBank tokenizer from NLTK
     from nltk.tokenize import TreebankWordTokenizer
     self._tb_tokenizer = TreebankWordTokenizer().tokenize
     # Initializing Punkt Sentence Tokenizer from NLTK
     from nltk import data
     self._sent_detector = data.load('tokenizers/punkt/english.pickle')
Example #3
File: rslp.py Project: DrDub/nltk
    def read_rule (self, filename):
        rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
        lines = rules.split("\n")

        lines = [line for line in lines if line != ""]     # remove blank lines
        lines = [line for line in lines if line[0] != "#"]  # remove comments

        # NOTE: a simple but ugly hack to make this parser happy with double '\t's
        lines = [line.replace("\t\t", "\t") for line in lines]

        # parse rules
        rules = []
        for line in lines:
            rule = []
            tokens = line.split("\t")

            # text to be searched for at the end of the string
            rule.append( tokens[0][1:-1] ) # remove quotes

            # minimum stem size to perform the replacement
            rule.append( int(tokens[1]) )

            # text to be replaced into
            rule.append( tokens[2][1:-1] ) # remove quotes

            # exceptions to this rule
            rule.append( [token[1:-1] for token in tokens[3].split(",")] )

            # append to the results
            rules.append(rule)

        return rules
Example #4
def get_tagger(lang):
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        else:
            _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
            eng_tagger = load(_POS_TAGGER)
            return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        else:
            print(111)
            training = cess_esp.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training,backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            spa_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            print(555)
            return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        else:
            training = cess_cat.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training,backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            cat_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return cat_tagger
Example #5
def batch_pos_tag(sentences):
    """
    Use NLTK's currently recommended part of speech tagger to tag the
    given list of sentences, each consisting of a list of tokens.
    """
    tagger = load(_POS_TAGGER)
    return tagger.batch_tag(sentences)
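batch_tag is the old NLTK 2.x method name for tagging many sentences at once; in NLTK 3 the equivalent method is tag_sents, and there is also a module-level helper. A rough modern equivalent of the function above, assuming NLTK 3 with its default tagger data installed:

import nltk

def batch_pos_tag_v3(sentences):
    # sentences: a list of token lists; returns a list of lists of (token, tag) pairs.
    return nltk.pos_tag_sents(sentences)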
Example #6
 def _split_sentence(self, s):
     '''
     sentence splitter
     '''
     #use French sentence tokenizer from nltk
     pst = data.load("tokenizers/punkt/french.pickle")
     return pst.tokenize(s)
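data.load unpickles the Punkt model on every call here, which is relatively slow. A common variation (a sketch, not part of the original class) caches the French model once and reuses it:

from nltk import data

class FrenchSentenceSplitter:
    def __init__(self):
        # Load the French Punkt model a single time.
        self._pst = data.load("tokenizers/punkt/french.pickle")

    def split_sentence(self, s):
        return self._pst.tokenize(s)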
Example #7
    def generate_instances(self, sentences, child_conn):
        # Each process has its own NLTK PoS-tagger
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        instances = list()
        while True:
            try:
                s = sentences.get_nowait()
                if sentences.qsize() % 500 == 0:
                    print(multiprocessing.current_process(), \
                        "Instances to process", sentences.qsize())

                sentence = Sentence(s,
                                    self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size,
                                    tagger,
                                    self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    instances.append(t)

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, instances))
                break
Example #8
    def meaning_words(self, text):
        # "Meaning" tags: nouns and adjectives only
        meaning_tags = ['NN', 'NNP', 'NNPS', 'JJ']
        default_tagger = data.load(tag._POS_TAGGER)

        # Sometimes the NLTK tagger misclassifies a part of speech (e.g. "The",
        # which should be a determiner). The duty tagger also helps to
        # eliminate common words that are not important.
        duty = {w: 'x' for w in self.common_words}

        enhanced_tagger = tag.UnigramTagger(model=duty, backoff=default_tagger)

        meaning_words = ' '.join(
            [w for w, c in enhanced_tagger.tag(word_tokenize(text))
             if c in meaning_tags and len(w) > 2])

        # If no meaning words are found with this approach, return None.
        if not meaning_words:
            return None
        else:
            return meaning_words
Example #9
def run(train, test, language, answer):
    results = {}
    if language == 'English':
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
    elif language == 'Spanish':
        tagger = ut(cess_esp.tagged_sents())
    elif language == 'Catalan':
        tagger  = ut(cess_cat.tagged_sents())

    for lexelt in train:

        train_features, y_train = extract_features(train[lexelt],language,tagger)
        test_features, _ = extract_features(test[lexelt],language,tagger)

        X_train, X_test = vectorize(train_features,test_features)
        X_train_new, X_test_new = feature_selection(X_train, X_test,y_train)
        results[lexelt] = classify(X_train_new, X_test_new,y_train)
    """
    B1.c
    for lexelt in train:
        features = getBestWords(train[lexelt], 30)
        train_features = countFeature(features, train[lexelt])
        _, y_train = extract_features(train[lexelt], language)
        test_features = countFeature(features, test[lexelt])

        X_train, X_test = vectorize(train_features, test_features)
        results[lexelt] = classify(X_train, X_test, y_train)
    B1.c
    """
    A.print_results(results, answer)
Example #10
def test_austen():
  from nltk.data import load
  from nltk.corpus import gutenberg as g
  stok = load('tokenizers/punkt/english.pickle')
  train = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-emma.txt'))]
  test1 = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-sense.txt'))]
  test2 = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]

  model1 = AdditiveSmoothing(n=2)
  model1.generate_model(train)
  print 'cross entropy additive smoothing:'
  print 'emma to sense&sensibility: %0.8f' % cross_entropy(model1, test1)
  print 'emma to persuasion: %0.8f' % cross_entropy(model1, test2)
  model2 = KnesserNey(n=2)
  model2.generate_model(train)
  print 'cross entropy knesser-ney smoothing:'
  print 'emma to sense&sensibility: %0.8f' % cross_entropy(model2, test1)
  print 'emma to persuasion: %0.8f' % cross_entropy(model2, test2)
  model3 = SimpleGoodTuring(n=2)
  model3.generate_model(train)
  print 'cross entropy simple good-turing smoothing:'
  print 'emma to sense&sensibility: %0.8f' % cross_entropy(model3, test1)
  print 'emma to persuasion: %0.8f' % cross_entropy(model3, test2)

  model4 = KatzSmoothing(n=2)
  model4.generate_model(train)
  print 'cross entropy katz smoothing:'
  print 'emma to sense&sensibility: %0.8f' % cross_entropy(model4, test1)
  print 'emma to persuasion: %0.8f' % cross_entropy(model4, test2)
Example #11
    def digest(self):
        if self.sentences is not None:
            return

        # Digest the problem into sentences
        tokenizer = data.load("tokenizers/punkt/english.pickle")
        self.sentences = tokenizer.tokenize(self.text.strip())

        # Digest each sentence into words and part-of-speech tags
        if self.sentence_tags is None:
            sentence_tags = []
            all_tags = []
            all_words = []
            for s in self.sentences:
                all_words.append(s)
                tags = pos_tag(word_tokenize(s))
                sentence_tags.append(tags)
                for t in tags:
                    l = len(t[0])
                    if not self.longest_word or self.longest_word < l:
                        self.longest_word = l
                    all_tags.append(t[1])
            self.sentence_tags = sentence_tags
            self.all_tags = uniq(all_tags)
            self.all_words = uniq(all_words)
Example #12
File: util.py Project: DrDub/nltk
def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None,
                     skip_header=True):
    """
    Parse a csv file containing tweets and output a list of (text, label) tuples.

    :param filename: the input csv filename.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
        sentences.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).

    :return: a list of (text, label) tuples.
    """
    tweets = []
    if not sent_tokenizer:
        sent_tokenizer = load('tokenizers/punkt/english.pickle')

    # If we use Python3.x we can proceed using the 'rt' flag
    if sys.version_info[0] == 3:
        with codecs.open(filename, 'rt') as csvfile:
            reader = csv.reader(csvfile)
            if skip_header == True:
                next(reader, None) # skip the header
            i = 0
            for tweet_id, text in reader:
                # text = text[1]
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizer to text
                if word_tokenizer:
                    tweet = [w for sent in sent_tokenizer.tokenize(text)
                                       for w in word_tokenizer.tokenize(sent)]
                else:
                    tweet = text
                tweets.append((tweet, label))
    # If we use Python2.x we need to handle encoding problems
    elif sys.version_info[0] < 3:
        with codecs.open(filename) as csvfile:
            reader = csv.reader(csvfile)
            if skip_header == True:
                next(reader, None) # skip the header
            i = 0
            for row in reader:
                unicode_row = [x.decode('utf8') for x in row]
                text = unicode_row[1]
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizer to text
                if word_tokenizer:
                    tweet = [w.encode('utf8') for sent in sent_tokenizer.tokenize(text)
                                       for w in word_tokenizer.tokenize(sent)]
                else:
                    tweet = text
                tweets.append((tweet, label))
    print("Loaded {0} tweets".format(i))
    return tweets
Example #13
def sent_tokenize(text):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`).
    """
    tokenizer = load("tokenizers/punkt/english.pickle")
    return tokenizer.tokenize(text)
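A quick usage sketch of the helper above (the exact boundaries depend on the English Punkt model, which normally treats abbreviations such as "Mr." as non-final):

text = "Mr. Smith went to Washington. He arrived on Monday."
for sentence in sent_tokenize(text):
    print(sentence)
# Typically prints two sentences, splitting only after "Washington."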
Example #14
    def load(self, loc):
        '''
        :param loc: Load a pickled model at location.
        :type loc: str 
        '''

        self.model.weights, self.tagdict, self.classes = load(loc)
        self.model.classes = self.classes
Example #15
  def __init__(self, encoding):
    """Constructor.
    """

    super(FrenchBonsaiTokenizer, self).__init__()

    self._sentence_tokenizer = data.load('tokenizers/punkt/french.pickle')
    self._encoding = encoding
Example #16
    def __init__(self):
        """

        :param train_percent_size: 0-1
        :return:
        """
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        self._tagger = load(_POS_TAGGER)
Example #17
def solve_problem(problem):
	tokenizer = load("tokenizers/punkt/english.pickle")
	sentences = tokenizer.tokenize(problem.strip())

	print "Problem input: {0}".format(problem)

	for s in get_statements(sentences):
		print "Statement: {0}".format(str(s))
		print "Solution: {0}".format(s.solve())
Example #18
File: util.py Project: DrDub/nltk
def load_parser(grammar_url, trace=0,
                parser=None, chart_class=None,
                beam_size=0, **load_args):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``CFG``)
      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        and higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.
        If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart;
        should be ``Chart`` or a subclass.
        Only used for CFGs and feature CFGs.
        If None, the chart class depends on the grammar format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, "
                         "or a subclass thereof.")
    if isinstance(grammar, PCFG):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)

    elif isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
        return parser(grammar, trace=trace, chart_class=chart_class)

    else: # Plain CFG.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
        return parser(grammar, trace=trace, chart_class=chart_class)
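A short usage sketch for load_parser, assuming the feature grammar 'grammars/book_grammars/feat0.fcfg' from the NLTK data package is installed:

cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=0)
tokens = 'Kim likes children'.split()
for tree in cp.parse(tokens):
    print(tree)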
Example #19
def split_sentences(corpus='rbc.txt', newfile='rbc_se.txt'):
    t = load('tokenizers/punkt/russian.pickle')
    text = open('.\\crawler\\' + corpus, 'r', encoding='utf-8')
    new = open(newfile, 'w', encoding='utf-8')
    for line in text:
        s = t.tokenize(line.strip('\n'))
        for sent in s:
            new.write(sent + '\n')
    text.close()
    new.close()
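A slightly more defensive variant of the same loop (a sketch with the same behaviour, reusing the load import from nltk.data and closing both files via context managers):

def split_sentences_safe(corpus='rbc.txt', newfile='rbc_se.txt'):
    t = load('tokenizers/punkt/russian.pickle')
    with open('.\\crawler\\' + corpus, 'r', encoding='utf-8') as text, \
         open(newfile, 'w', encoding='utf-8') as new:
        for line in text:
            for sent in t.tokenize(line.strip('\n')):
                new.write(sent + '\n')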
Example #20
def treebank_tokenizer(sentence):
    tokenizer = load('data/german.pickle')
    treebank_word_tokenize = TreebankWordTokenizer().tokenize
    tokens = []
    for s in tokenizer.tokenize(sentence):
        tokens.extend([token for token in treebank_word_tokenize(s)])
    tokens = [
        ''.join(i for i in s if i not in string.punctuation) for s in tokens
    ]
    tokens = list(filter(None, tokens))
    return tokens
Example #21
def ne_chunk_sents(tagged_sentences, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged sentences, each consisting of a list of tagged tokens.
    """
    if binary:
        chunker_pickle = _BINARY_NE_CHUNKER
    else:
        chunker_pickle = _MULTICLASS_NE_CHUNKER
    chunker = load(chunker_pickle)
    return chunker.parse_sents(tagged_sentences)
Example #22
def ne_chunk(tagged_tokens, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to
    chunk the given list of tagged tokens.
    """
    if binary:
        chunker_pickle = _BINARY_NE_CHUNKER
    else:
        chunker_pickle = _MULTICLASS_NE_CHUNKER
    chunker = load(chunker_pickle)
    return chunker.parse(tagged_tokens)
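These wrappers mirror nltk.ne_chunk and nltk.chunk.ne_chunk_sents. A typical end-to-end call, assuming the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words resources are installed:

import nltk

sentence = "Barack Obama was born in Hawaii."
tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
print(tree)
# tree is an nltk.Tree whose named-entity subtrees carry labels such as PERSON or GPE.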
Example #23
    def update_attributes(self, settingfile_input):
        searchURL = self.http + "/search"
        feature_service = "Feature Service"

        query_dict = {'f': 'json',
                      'token': self.token,
                      'q': "tags:\"" + self.utag + "\" AND owner:\"" + self.username + "\" AND type:\"" + feature_service + "\""}

        jsonResponse = sendAGOLReq(searchURL, query_dict)
        if jsonResponse['total'] == 0:
            #feature_id = jsonResponse['results'][0]['id']
            DirMGMT().lgr.error("\n.Couldn't find the service.\n")
            sys.exit()

        else:
            #jsonResponse = sendAGOLReq(searchURL, query_dict)
            feature_id = jsonResponse['results'][0]['id']

        # Update
        updateURL = agol.http + '/content/users/{}/items/{}/update'.format(agol.username, feature_id)

        sentence_break = data.load('tokenizers/punkt/english.pickle')

        temp_desc = ReadSF(settingfile_input).description
        utagloc = temp_desc.find('uTag')
        cut = temp_desc[utagloc:utagloc+42]
        temp_desc = temp_desc.replace(cut, '')
        # TODO remove tags from
        temp_tags = ReadSF(settingfile_input).tags
        # utag = temp_tags.split()[-1]
        # lutag = temp_tags.rfind(utag)-2
        # temp_tags = temp_tags[0:lutag]


        url = updateURL + "?f=json&token=" + agol.token + \
              "&type=Feature Service" \
              "&title=" + agol.serviceName.replace('_', ' ') + \
              "&tags=" + temp_tags + \
              "&snippet=" + sentence_break.tokenize(ReadSF(settingfile_input).description.strip())[0] + \
              "&description=" + temp_desc
              # "&description=" + ReadSF(settingfile_input).description.replace("\n\nuTag: "+ReadSF(settingfile_input).tags[-1], '')

        response = requests.post(url)
        itemPartJSON = json.loads(response.text)

        if "success" in itemPartJSON:
            # itemPartID = itemPartJSON['id']
            itemPartTitle = itemPartJSON['id']
            DirMGMT().lgr.info("updated Feature Layer:   {}".format(itemPartTitle))
            return True
        else:
            DirMGMT().lgr.error("\n.sd file not uploaded. Check the errors and try again.\n")
            DirMGMT().lgr.error(itemPartJSON)
            sys.exit()
Example #24
def ne_chunk(tagged_tokens, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to
    chunk the given list of tagged tokens.
    """
    if binary:
        chunker_pickle = _BINARY_NE_CHUNKER
    else:
        chunker_pickle = _MULTICLASS_NE_CHUNKER
    chunker = load(chunker_pickle)
    return chunker.parse(tagged_tokens)
Example #25
def ne_chunk_sents(tagged_sentences, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged sentences, each consisting of a list of tagged tokens.
    """
    if binary:
        chunker_pickle = _BINARY_NE_CHUNKER
    else:
        chunker_pickle = _MULTICLASS_NE_CHUNKER
    chunker = load(chunker_pickle)
    return chunker.parse_sents(tagged_sentences)
Example #26
def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = load("tokenizers/punkt/{0}.pickle".format(language))
    return tokenizer.tokenize(text)
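Because the language argument only selects a Punkt pickle, the same helper covers any language shipped with the punkt data, for example (assuming the German model is installed):

german = "Das ist ein Satz. Hier kommt noch einer."
print(sent_tokenize(german, language="german"))
# Expected: the two sentences as separate list items.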
Example #27
 def __init__(self, language):
     """
     :param str language: ISO 639-1 language code. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
     """
     self.language = language
     model = self.supported_models.get(language)
     if model:
         self.splitter = load(model)
     else:
         raise ValueError(
             "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s"
             % (language, self.supported_models.keys()))
Example #28
def read_training_data(training_file):
    """
    Extracts part-of-speech (POS) tag counts, transition counts between tags, and emission counts from a tagged training corpus.

    The POS tag count keeps track of the number of times a given POS tag occurs in the training data.
    This is stored in a dictionary with POS tag keys and integer count values.

    The transition counts keep track of how often the first tag is followed by a second tag.
    This is stored in a dictionary with tuple(tag1, tag2) keys and, as values, the number of times tag1 is followed by tag2.

    The emission count keeps track of the number of times a word and its associated tag occurs in the data.
    This is stored in a dictionary with tuple(word, POS tag) keys and integer count values.

    The training file is expected to be a training set of POS-tagged sentences, separated by newline characters.
    Additional custom tags, "START" and "END", are included to indicate the start and end of each sentence.

    :param training_file: the location of the training file
    :return: a tuple of dictionaries tracking tag counts, transition counts, and emission counts
    """
    tag_types = list(load('help/tagsets/upenn_tagset.pickle').keys()) + [
        "START", "END", "-LRB-", "-RRB-", "#"
    ]
    tag_types = [x for x in tag_types if x not in ["(", ")", "--"]
                 ]  # The tagset in nltk uses different notations
    tag_type_permutations = list(product(tag_types, repeat=2))

    tag_counts = dict.fromkeys(tag_types, 0)
    transition_counts = dict.fromkeys(tag_type_permutations, 0)
    emission_counts = {}

    with open(training_file, "r") as training_data:
        for line in tqdm(training_data,
                         total=rawcount(training_file),
                         desc="Training"):

            tagged_tokens = tuple(
                str2tuple(tagged_token) for tagged_token in line.split())
            tag_sequence = ("START", ) + tuple(
                tagged_token[1] for tagged_token in tagged_tokens) + ("END", )

            for tag in tag_sequence:
                tag_counts[tag] += 1

            for tag_pair in pairwise(tag_sequence):
                transition_counts[tag_pair] += 1

            for tagged_token in tagged_tokens:
                if tagged_token in emission_counts:
                    emission_counts[tagged_token] += 1
                else:
                    emission_counts[tagged_token] = 1

    return tag_counts, transition_counts, emission_counts
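The three count dictionaries returned here are exactly what an HMM-style tagger needs. As a rough illustration (not part of the original code), maximum-likelihood transition and emission probabilities could be derived from them like this:

def estimate_probabilities(tag_counts, transition_counts, emission_counts):
    # P(tag2 | tag1) and P(word | tag), both as plain MLE without smoothing.
    transition_probs = {
        (t1, t2): count / tag_counts[t1]
        for (t1, t2), count in transition_counts.items() if tag_counts[t1] > 0
    }
    emission_probs = {
        (word, tag): count / tag_counts[tag]
        for (word, tag), count in emission_counts.items()
    }
    return transition_probs, emission_probs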
Example #29
def sent_tokenize(text, language='english'):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
    return tokenizer.tokenize(text)
Example #30
 def __init__(self, language):
     """
     :param str language: ISO 639-1 language code. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
     """
     self.language = language
     model = self.supported_models.get(language)
     if model:
         self.splitter = load(model)
     else:
         raise ValueError(
             "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                 language, self.supported_models.keys()))
Example #31
def _load_universal_map(fileid):
    mapping = {}
    contents = load(join(_UNIVERSAL_DATA, fileid+'.map'), format="text")
    for line in contents.splitlines():
        line = line.strip()
        if line == '':
            continue
        fine, coarse = line.split('\t')

        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
        assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine)

        _MAPPINGS[fileid]['universal'][fine] = coarse
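This loader populates the fine-to-coarse mapping behind NLTK's universal tagset support; most user code reaches it through nltk.tag.map_tag instead, e.g. (assuming the universal_tagset resource is installed):

from nltk.tag import map_tag

print(map_tag('en-ptb', 'universal', 'NNS'))  # expected: 'NOUN'
print(map_tag('en-ptb', 'universal', 'VBZ'))  # expected: 'VERB'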
Example #32
def _load_universal_map(fileid):
    mapping = {}
    contents = load(join(_UNIVERSAL_DATA, fileid+'.map'), format="text")
    for line in contents.splitlines():
        line = line.strip()
        if line == '':
            continue
        fine, coarse = line.split('\t')

        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
        assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine)

        _MAPPINGS[fileid]['universal'][fine] = coarse
Example #33
    def __init__(self):

        self.GRAPHENE_SERVICE = "http://nietzsche.fim.uni-passau.de:8080/simplification/text"
        self.premiseIndicators = self.read_key_words(
            "resources/premise_indicator.txt")
        self.claimIndicators = self.read_key_words(
            "resources/claim_indicator.txt")
        self.tagdict = load('help/tagsets/upenn_tagset.pickle')
        self.lb = preprocessing.LabelBinarizer()
        self.lb.fit(list(self.tagdict.keys()))
        self.nlp = spacy.load('en')
        self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(
            'resources/GoogleNews-vectors-negative300.bin.gz', binary=True)
Example #34
def construct_graph(document):
    sentence_detector = data.load('tokenizers/punkt/english.pickle')
    sentences = sentence_detector.tokenize(document)

    nodes = [Node(sentence) for sentence in sentences]
    for idx1 in range(len(nodes)):
        print(idx1, len(nodes))
        for idx2 in range(idx1 + 1, len(nodes)):
            node1, node2 = nodes[idx1], nodes[idx2]
            edge_weight = cosine_distance(node1.value, node2.value)
            node1.connect(node2, edge_weight)

    return nodes
Example #35
def pos_freqs(texts):

    tagdict = load('help/tagsets/upenn_tagset.pickle')
    keys = tagdict.keys()
    key_list = list(keys)

    freqs_array = np.zeros((len(texts), len(key_list)), dtype=int)
    for i, text in enumerate(texts):
        tags = pos_tagger(text)
        for j, key in enumerate(key_list):
            freqs_array[i, j] = len([tag for tag in tags if tag == key])

    return freqs_array
Example #36
def _format_tagset(tagset, tagpattern=None):
    tagdict = load("help/tagsets/" + tagset + ".pickle")
    if not tagpattern:
        _print_entries(sorted(tagdict), tagdict)
    elif tagpattern in tagdict:
        _print_entries([tagpattern], tagdict)
    else:
        tagpattern = re.compile(tagpattern)
        tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)]
        if tags:
            _print_entries(tags, tagdict)
        else:
            print("No matching tags found.")
Example #37
def _format_tagset(tagset, tagpattern=None):
    tagdict = load("help/tagsets/" + tagset + ".pickle")
    if not tagpattern:
        _print_entries(sorted(tagdict), tagdict)
    elif tagpattern in tagdict:
        _print_entries([tagpattern], tagdict)
    else:
        tagpattern = re.compile(tagpattern)
        tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)]
        if tags:
            _print_entries(tags, tagdict)
        else:
            print("No matching tags found.")
Example #38
File: mapping.py Project: xim/nltk
def _load_universal_map(fileid):
    mapping = {}
    contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")
    for line in contents.splitlines():
        line = line.strip()
        if line == "":
            continue
        fine, coarse = line.split("\t")

        assert coarse in _UNIVERSAL_TAGS, "Unexpected coarse tag: {}".format(coarse)
        assert fine not in _MAPPINGS[fileid]["universal"], "Multiple entries for original tag: {}".format(fine)

        _MAPPINGS[fileid]["universal"][fine] = coarse
Example #39
def srcparse(src):
	tokenizer = load("tokenizers/punkt/english.pickle")
	sentences = tokenizer.tokenize(src.strip().lower())
	bs = compile(r'\d*:\d*')
	rm = compile(r'[*.?!,\'":;\(\)<>]')
	sp = compile(r'[\-\+]')

	starts, joins, ends = [], {}, []

	for sentence in sentences:
		# Format the sentence
		wlist = word_tokenize(bs.sub(" ", sp.sub(" ", rm.sub("",
			sentence.replace("\n", " ")))))

		if len(wlist) < 3:
			# Ignore sentences without triples in the corpus
			continue

		# Add the sentence starting word
		starts.append(wlist[0])

		# Reverse the list so we can build from the ending
		wlist.reverse()

		for i in range(len(wlist) - 2):
			w1 = wlist[i]
			w2 = wlist[i + 1]
			w3 = wlist[i + 2]

			# Handle zero-length breaks in the corpus.
			if 0 in [len(w1), len(w2), len(w3)]:
				continue

			# Generate a list of words which can start a sentence properly
			if i == 0:
				ends.append(w1)

			# Store doubles
			try:
				joins[w1].append(w2)
			except KeyError:
				joins[w1] = [w2]

			# Store triples
			key = (w1, w2)
			try:
				joins[key].append(w3)
			except KeyError:
				joins[key] = [w3]

	return starts, joins, ends
Example #40
def srcparse(src):
    tokenizer = load("tokenizers/punkt/english.pickle")
    sentences = tokenizer.tokenize(src.strip().lower())
    bs = compile(r'\d*:\d*')
    rm = compile(r'[*.?!,\'":;\(\)<>]')
    sp = compile(r'[\-\+]')

    starts, joins, ends = [], {}, []

    for sentence in sentences:
        # Format the sentence
        wlist = word_tokenize(
            bs.sub(" ", sp.sub(" ", rm.sub("", sentence.replace("\n", " ")))))

        if len(wlist) < 3:
            # Ignore sentences without triples in the corpus
            continue

        # Add the sentence starting word
        starts.append(wlist[0])

        # Reverse the list so we can build from the ending
        wlist.reverse()

        for i in range(len(wlist) - 2):
            w1 = wlist[i]
            w2 = wlist[i + 1]
            w3 = wlist[i + 2]

            # Handle zero-length breaks in the corpus.
            if 0 in [len(w1), len(w2), len(w3)]:
                continue

            # Generate a list of words which can start a sentence properly
            if i == 0:
                ends.append(w1)

            # Store doubles
            try:
                joins[w1].append(w2)
            except KeyError:
                joins[w1] = [w2]

            # Store triples
            key = (w1, w2)
            try:
                joins[key].append(w3)
            except KeyError:
                joins[key] = [w3]

    return starts, joins, ends
Example #41
File: map1.py Project: fhieber/cdec
def main():
	parser = argparse.ArgumentParser(description="yields uni- and bigrams for pmi models.")
	parser.add_argument('-s', '--stopwords', type=str, help='filter stopwords')
	parser.add_argument('-d', '--digits', action="store_true", default=False, help="remove digits")
	parser.add_argument('-p', '--punctuation', action="store_true", default=False, help="remove punctuation")
	parser.add_argument('-l', '--length', type=int, default=None, help="minimum word length")
	parser.add_argument('-t', '--tags', nargs='+', default=set(), type=str, help="specify forbidden pos tags")
	parser.add_argument('-S', '--sentence-tok', action="store_true", default=False, help="split document into sentences. Co-occurrence boundary is then within sentences")
	parser.add_argument('--lowercase', action="store_true", default=False, help="lowercase input")
	parser.add_argument('--tokenize', action="store_true", default=False, help="tokenize input. necessary for pos-tagging.")
	parser.add_argument('-w', '--window-size', type=int, help="set co-occurrence boundary to a window of x terms")
	parser.add_argument('--stemming', action="store_true", default=False, help="perform stemming with the porter2 stemming algorithm")
	parser.add_argument('-u', '--unicode', action="store_true", default=False, help="use Unicode input/output and filter on unicode categories")
	parser.add_argument('--unidecode', action="store_true", default=False, help="convert unicode symbols to ASCII symbols if possible (using Unidecode package)")
	args = parser.parse_args()

	sys.stderr.write(str(args)+"\n") # arg namespace
	fd = sys.stdin
	out = sys.stdout
	if args.unidecode or args.unicode:
		fd = codecs.getreader('utf-8')(sys.stdin)
		out = codecs.getwriter('utf-8')(sys.stdout)

	global pos_tagger
	global sen_tagger
	pos_tagger, sen_tagger = None, None
	excluded = set()
	if args.stopwords:
		excluded |= load_stopwords(args.stopwords)
	if args.punctuation:
		excluded |= punct
	if args.tags:
		pos_tagger = data.load('file:postagger', format="pickle")
	if args.sentence_tok:
		sen_tagger = data.load('file:sentencetokenizer', format="pickle")
	
	for w in yieldWords(fd, args, excluded):
		out.write(w + "\n")
Example #42
    def __init__(self, word_embeddings, seq_length=1000, stopwords='default'):
        """
        Initialises the embedder class.
        Expects a WordEmbeddings object.
        """
        self.embeddings = word_embeddings
        self.MAX_SEQUENCE_LENGTHS = seq_length

        if (stopwords == 'default'):
            self.STOPWORDS = STOPWORDS
        else:
            self.STOPWORDS = stopwords

        self.postags = load('help/tagsets/upenn_tagset.pickle')
Example #43
def get_tags_bow(sentences):
    if os.path.isfile(TAGS_BOW):
        return data.get_pickle(TAGS_BOW)
    else:
        from collections import Counter
        from nltk.data import load
        corpus = list(load('help/tagsets/upenn_tagset.pickle').keys())
        f = lambda x: Counter([y for y in x if y in corpus])
        df = pd.DataFrame({"tags": sentences})
        df["bow"] = (pd.DataFrame(df["tags"].apply(f).values.tolist()).reindex(
            columns=corpus).fillna(0).astype(int).values.tolist())
        result = df["bow"].tolist()
        data.save_pickle(TAGS_BOW, result)
        return result
Example #44
def generate_vocab_pos_upenn():
    # Getting tags from upenn_tagset
    nltk.download('tagsets', quiet=True)
    tagdict = load('help/tagsets/upenn_tagset.pickle')

    # creating dictionary with pos_tags, using negative numbers
    pos_dic = dict(enumerate(list(set(tagdict.keys()))))
    pos_dictionary = {v: -(k + 1) for k, v in pos_dic.items()}

    # with open(pos_vocabulary_pkl , 'wb') as output:
    #     pickle.dump(pos_dictionary, output, pickle.HIGHEST_PROTOCOL)
    #     print("Pos vocabulary saved as pkl")

    return pos_dictionary
Example #45
def getSentencesFromFiles(dataDir):
    '''Read text files in a directory
       and return a list of all sentences
       Doing this for huge data will be suicidal
    '''
    sentDetector = ntd.load('tokenizers/punkt/english.pickle')
    allSentences=[]
    for folderName, subfolders, filenames in os.walk(dataDir):
        for file in filenames:
            print("Extracting Sentences from file ",file)
            text = open(dataDir+'\\'+file,encoding='utf-8')
            sentence = sentDetector.tokenize(text.read())
            allSentences.extend(sentence)
    return allSentences
Example #46
def prediction_neighbor_with_pos(lines_with_unknown, word2vec_model, NN_model):
    """
    Return the predicted word for each unknown token, based on similarity with the nearest neighbours.
    This time, the LSTM POS model is used to select the word based on its POS tag.
    :param lines_with_unknown: list of string lines with token words.
    :param word2vec_model: the word2vec model used to get similar words.
    :param NN_model: the model used to predict the POS tag of the word.
    :return: predicted words.
    """
    predicted_words = []  # List of all the predicted words
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    list_tags = list(tagdict.keys())  # Get the list of all the tags.
    for i in range(len(lines_with_unknown)):
        line = lines_with_unknown[i]
        if 'unk' in line:  # If the line contains the word 'unk'
            index = line.index('unk')
            neighbours_words = [
                line[i] for i in (index - 2, index - 1, index + 1, index + 2)
            ]  # Extract the words around
            most_similar_list = word2vec_model.most_similar(
                positive=neighbours_words)[:10]
            sample = []

            for word in neighbours_words:  # Format the neighbouring words for the Neural Network
                sample.append(one_hot_encoding(word, list_tags).tolist())

            Y_pos = NN_model.predict(np.array(sample).reshape(
                (1, 4, 45)))  # Predict the vector of POS tag
            id_pos = np.argmax(Y_pos)  # Take the id
            pos_tag = list_tags[
                id_pos]  # We got now the POS tag which is predicted, we can get a more accurate prediction

            # We then check if there is a word with the corresponding POS tag among the top 10.

            best_candidate = []
            for i in range(len(most_similar_list)):
                word = most_similar_list[i][0]
                if nltk.pos_tag([word])[0][1] == pos_tag:
                    best_candidate.append(word)

            if best_candidate:  # If the list is not empty
                predicted_words.append(
                    best_candidate[0])  # Take the first element
            else:
                predicted_words.append(
                    most_similar_list[0]
                    [0])  # Otherwise we just take the first element

    return predicted_words
Example #47
 def __init__(self):
     current_dir = os.path.dirname(inspect.stack()[0][1])
     parent_dir = current_dir.rsplit('/', 1)[0]
     self.GRAPHENE_SERVICE = "http://nietzsche.fim.uni-passau.de:8080/simplification/text"
     self.premiseIndicators = self.read_key_words(
         (parent_dir + "/resources/premise_indicator.txt"))
     self.claimIndicators = self.read_key_words(
         (parent_dir + "/resources/claim_indicator.txt"))
     self.tagdict = load('help/tagsets/upenn_tagset.pickle')
     self.lb = preprocessing.LabelBinarizer()
     self.lb.fit(list(self.tagdict.keys()))
     self.nlp = spacy.load('en')
     self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(
         (parent_dir + '/resources/GoogleNews-vectors-negative300.bin.gz'),
         binary=True)
Example #48
def _load_universal_map(fileid):
    contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")
    _MAPPINGS[fileid]["universal"].default_factory = lambda: "X"

    for line in contents.splitlines():
        line = line.strip()
        if line == "":
            continue
        fine, coarse = line.split("\t")

        assert coarse in _UNIVERSAL_TAGS, "Unexpected coarse tag: {}".format(
            coarse)
        assert (fine not in _MAPPINGS[fileid]["universal"]
                ), "Multiple entries for original tag: {}".format(fine)

        _MAPPINGS[fileid]["universal"][fine] = coarse
Example #49
def _load_universal_map(fileid):
    contents = load(join(_UNIVERSAL_DATA, fileid+'.map'), format="text")

    # When mapping to the Universal Tagset,
    # map unknown inputs to 'X' not 'UNK'
    _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'

    for line in contents.splitlines():
        line = line.strip()
        if line == '':
            continue
        fine, coarse = line.split('\t')

        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
        assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine)

        _MAPPINGS[fileid]['universal'][fine] = coarse
Example #50
    def generate_tuples(self, sentences_file):
        """
        Generate tuple instances from a text file with sentences where named entities are
        already tagged

        :param sentences_file:
        """
        if os.path.exists("processed_tuples.pkl"):

            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")

        else:

            # load needed stuff, word2vec model and a pos-tagger
            self.config.read_word2vec()
            tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        sys.stdout.write(".")

                    sentence = Sentence(line.strip(), self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size,
                                        tagger, self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)
                print("\n", len(self.processed_tuples), "tuples generated")

            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)
Example #51
def get_subjectivity_analyzer(lang):
    try:
        sa_subj_data_file_path = 'nltk_data/sa_subjectivity.pickle'

        sentim_analyzer = load(DEFAULT_PROJECT_PATH + sa_subj_data_file_path)

    except LookupError:
        my_print(
            '{}Cannot find the sentiment analyzer you want to load.'.format(
                WARNING_FLAG))
        my_print(
            '{}Training & save a new one using NaiveBayesClassifier.'.format(
                WARNING_FLAG))

        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    return sentim_analyzer
Example #52
def POS_tagging(essay):
    """
		Parts of speech tagging done. Net count of each is returned.
	"""
    POS_dict = {}
    for i, j in nltk.pos_tag(essay):
        if j in POS_dict:
            POS_dict[j] += 1
        else:
            POS_dict[j] = 1

    tagdict = load('help/tagsets/upenn_tagset.pickle')
    for i in tagdict:
        if i not in POS_dict:
            POS_dict[i] = 0

    return POS_dict
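nltk.pos_tag expects a token list, so callers of this function would tokenize the essay first. A small usage sketch:

import nltk

essay_tokens = nltk.word_tokenize("The cat sat on the mat.")
pos_counts = POS_tagging(essay_tokens)
print(pos_counts['DT'], pos_counts['NN'])  # counts for determiners and singular nouns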
Example #53
 def __init__(self, filename):
     self.filename = filename
     self.tokenizer = TreebankWordTokenizer()
     self.sent_tokenizer = load(
         'tokenizers/punkt/{0}.pickle'.format('english'))
     self.st = StanfordPOSTagger(
         '../stanfordPOStagger/english-bidirectional-distsim.tagger',
         '../stanfordPOStagger/stanford-postagger.jar',
         java_options='-mx2048m')
     #self.w2v_model = KeyedVectors.load_word2vec_format(
     #    "C:/Users/PC1/Desktop/python/деплом/deplom/constructions/GoogleNews-vectors-negative300.bin.gz",
     #    binary=True)
     self.w2v_model = None
     self.text = self.get_text()
     self.anns = []
     self.idx_list = IdxList()
     self.punct = punctuation + '‘’— \t\n'
Example #54
def getDocExcerpt(docId, corpus):
    doc = getDoc(docId, corpus)

    if corpus == Corpus.COURSES:
        text = doc["descr"]
    elif corpus == Corpus.REUTERS:
        text = doc["body"]
    else:
        # Should never hit this
        sys.exit(-1)

    # https://www.nltk.org/api/nltk.tokenize.html
    # Create sentence classifier
    sent_detector = load('tokenizers/punkt/english.pickle')
    excerpt = sent_detector.tokenize(text)[0]

    return excerpt
Example #55
    def __str__(self):
        """Output the problem details in asciidoc"""
        out = []

        # Helpers
        def title(t, tier="=="):
            out.append("{0} {1}".format(tier, t))

        def block(t):
            out.append("****\n{0}\n****\n".format(t))

        # Create output
        title("Problem")
        out.append(self.text)

        if self.interpretation is not None:
            title("Interpretation")
            out.append(str(self.interpretation))

        if self.solution is not None:
            title("Solution")
            out.append(str(self.solution))

            title("Answer")
            block(self.solution.answer)

        if self.debug:
            title("Debugging")
            if self.sentence_tags is not None:
                # Display all the sentence tags
                title("Sentences", "===")
                for tags in self.sentence_tags:
                    block(str(tags))

                # Define what each tag means
                title("Tags", "===")
                tagdict = load('help/tagsets/upenn_tagset.pickle')
                for t in self.all_tags:
                    if not t in tagdict:
                        d = ("?", "No examples")
                    else:
                        d = tagdict[t]
                    block("*Tag '{0}'*: {1}\n\n{2}".format(t, d[0], d[1]))

        return "\n".join(out) + "\n"
Example #56
def parse_tweets_set(filename,
                     label,
                     word_tokenizer=None,
                     sent_tokenizer=None,
                     skip_header=True):
    """
    Parse a csv file containing tweets and output a list of (text, label) tuples.

    :param filename: the input csv filename.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
        sentences.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).

    :return: a list of (text, label) tuples.
    """
    tweets = []
    if not sent_tokenizer:
        sent_tokenizer = load("tokenizers/punkt/english.pickle")

    with codecs.open(filename, "rt") as csvfile:
        reader = csv.reader(csvfile)
        if skip_header == True:
            next(reader, None)  # skip the header
        i = 0
        for tweet_id, text in reader:
            # text = text[1]
            i += 1
            sys.stdout.write("Loaded {0} tweets\r".format(i))
            # Apply sentence and word tokenizer to text
            if word_tokenizer:
                tweet = [
                    w for sent in sent_tokenizer.tokenize(text)
                    for w in word_tokenizer.tokenize(sent)
                ]
            else:
                tweet = text
            tweets.append((tweet, label))

    print("Loaded {0} tweets".format(i))
    return tweets
Example #57
    def generate_tuples(self, sentences_file):
        """
        Generate tuple instances from a text file with sentences where
        named entities are already tagged
        """
        try:
            os.path.isfile("processed_tuples.pkl")
            f = open("processed_tuples.pkl", "r")
            print "\nLoading processed tuples from disk..."
            self.processed_tuples = cPickle.load(f)
            f.close()
            print len(self.processed_tuples), "tuples loaded"

        except IOError:
            self.config.read_word2vec()
            tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
            print "\nGenerating relationship instances from sentences"
            f_sentences = codecs.open(sentences_file, encoding='utf-8')
            count = 0
            for line in f_sentences:
                if line.startswith("#"):
                    continue
                count += 1
                if count % 10000 == 0:
                    sys.stdout.write(".")

                sentence = Sentence(line.strip(), self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size, tagger,
                                    self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    self.processed_tuples.append(t)

            f_sentences.close()

            print "\n", len(self.processed_tuples), "tuples generated"
            print "Writing generated tuples to disk"
            f = open("processed_tuples.pkl", "wb")
            cPickle.dump(self.processed_tuples, f)
            f.close()
Example #58
def generate_tweet(bigram, trigram):
    tweet = ''
    word1 = ''
    word2 = START_TOKEN

    # Keeping adding words until we reach an end token
    while word2 != END_TOKEN:

        # First try to use the trigram
        choices = trigram[word1][word2]

        # Fallback on bigram if necessary
        if len(choices.items()) == 0:
            choices = bigram[word2]

        # Choose a new word based on the weighted values of the choices
        flat_choices = []
        for key, value in choices.items():
            flat_choices += [key] * value
        word3 = choice(flat_choices)
        tweet += word3 + ' '

        # Advance generator words
        word1 = word2
        word2 = word3

    # Reformat tweet
    tweet = tweet[:-(len(END_TOKEN) + 2)]  # Remove end token
    tweet = re.sub(r' !', '!', tweet)  # Join exclamation marks
    tweet = re.sub(r' \?', '?', tweet)  # Join question marks
    tweet = re.sub(r' \.', '.', tweet)  # Join periods

    # Capitalize sentences
    sentence_tokenizer = load('tokenizers/punkt/english.pickle')
    sentences = sentence_tokenizer.tokenize(tweet)
    sentences = [(sentence[0].upper() + sentence[1:])
                 for sentence in sentences]
    tweet = ' '.join(sentences)

    # Validate tweet
    is_valid_tweet = len(tweet) <= 280
    if is_valid_tweet:
        return tweet
    else:
        return generate_tweet(bigram, trigram)
Example #59
def get_num_pos_tags(train, test):
    # First find out which POS tags are possible
    vocabulary = list(load('help/tagsets/upenn_tagset.pickle'))
    pos_tags_train = [tweet.pos_tags for tweet in train]
    pos_tags_test = [tweet.pos_tags for tweet in test]

    # CountVectorizer is used to create a vector for each tweet. Each number in this vector
    # represents the number of occurrences for a specific POS tag.
    # All those vectors have the same length, which is needed to use them for the SVM.
    vectorizer = CountVectorizer(vocabulary=vocabulary,
                                 tokenizer=lambda doc: doc,
                                 lowercase=False)

    train_vector = vectorizer.transform(pos_tags_train)
    test_vector = vectorizer.transform(pos_tags_test)

    return np.asarray(train_vector.toarray()), np.asarray(
        test_vector.toarray())
Example #60
def extract_pos_tag(string):
    nltk.download('tagsets')
    nltk.download('averaged_perceptron_tagger')
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    keyList = []
    for key in tagdict.keys():
        keyList.append(key)

    skeleton_dict = {key: 0 for key in keyList}
    ts = nltk.word_tokenize(string)
    td = nltk.pos_tag(ts)

    sdc = skeleton_dict.copy()
    for i in range(len(td)):
        sdc[td[i][1]] = sdc[td[i][1]] + 1

    # return list(sdc.items())
    return [v for k, v in sdc.items()]
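A quick usage sketch for the function above: it returns one count per Penn Treebank tag, in the fixed key order of the tagset dictionary:

vector = extract_pos_tag("The quick brown fox jumps over the lazy dog.")
print(len(vector))  # one entry per tag in the upenn tagset
print(sum(vector))  # total number of tagged tokens in this sentence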