def tag(text):
    """Tags the input text.

    Arguments:
        text (str): The text to tag.

    Returns:
        ([[(str, str)]]): List of sentences containing lists of word/tag pairs.
    """
    # Separate the input text into sentences
    sentences = nltk.sent_tokenize(str(text))

    # Separate each sentence into words
    nested = []
    for sentence in sentences:
        nested.append(nltk.word_tokenize(sentence))

    # Prepare default tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)  # Same tagger as using nltk.pos_tag

    # Prepare regex tagger for custom tags
    regexp_tagger = nltk.tag.RegexpTagger([(r'\(', '('), (r'\)', ')'),
                                           (r'\[', '['), (r'\]', ']'),
                                           (r'_+', 'None')],
                                          backoff=tagger)

    # Add a part-of-speech tag to each word
    nested_tagged = []
    for sentence in nested:
        nested_tagged.append([TaggedToken(*x) for x in regexp_tagger.tag(sentence)])

    return nested_tagged
def __init__(self):
    # Initializing TreeBank tokenizer from NLTK
    from nltk.tokenize import TreebankWordTokenizer
    self._tb_tokenizer = TreebankWordTokenizer().tokenize

    # Initializing Punkt Sentence Tokenizer from NLTK
    from nltk import data
    self._sent_detector = data.load('tokenizers/punkt/english.pickle')
def read_rule(self, filename):
    rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")

    lines = rules.split("\n")
    lines = [line for line in lines if line != ""]      # remove blank lines
    lines = [line for line in lines if line[0] != "#"]  # remove comments

    # NOTE: a simple but ugly hack to make this parser happy with double '\t's
    lines = [line.replace("\t\t", "\t") for line in lines]

    # parse rules
    rules = []
    for line in lines:
        rule = []
        tokens = line.split("\t")

        # text to be searched for at the end of the string
        rule.append(tokens[0][1:-1])  # remove quotes

        # minimum stem size to perform the replacement
        rule.append(int(tokens[1]))

        # text to be replaced into
        rule.append(tokens[2][1:-1])  # remove quotes

        # exceptions to this rule
        rule.append([token[1:-1] for token in tokens[3].split(",")])

        # append to the results
        rules.append(rule)

    return rules
def get_tagger(lang):
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        else:
            _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
            eng_tagger = load(_POS_TAGGER)
            return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        else:
            # Train a backoff chain of n-gram taggers on the CESS-ESP corpus
            training = cess_esp.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            spa_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        else:
            # Train a backoff chain of n-gram taggers on the CESS-CAT corpus
            training = cess_cat.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            cat_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return cat_tagger
def batch_pos_tag(sentences):
    """
    Use NLTK's currently recommended part of speech tagger to tag the
    given list of sentences, each consisting of a list of tokens.
    """
    tagger = load(_POS_TAGGER)
    return tagger.batch_tag(sentences)
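# Illustrative usage of batch_pos_tag above (a sketch, not part of the original
# sources). It assumes _POS_TAGGER is the module-level maxent treebank pickle path
# seen in the surrounding snippets and that the corresponding NLTK data is installed.
def _demo_batch_pos_tag():
    # Each inner list is one pre-tokenized sentence.
    sentences = [["This", "is", "a", "test", "."],
                 ["Tag", "every", "token", "."]]
    # Returns one list of (token, tag) pairs per input sentence.
    return batch_pos_tag(sentences)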
def _split_sentence(self, s):
    '''sentence splitter'''
    # use French sentence tokenizer from nltk
    pst = data.load("tokenizers/punkt/french.pickle")
    return pst.tokenize(s)
def generate_instances(self, sentences, child_conn):
    # Each process has its own NLTK PoS-tagger
    tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    instances = list()
    while True:
        try:
            s = sentences.get_nowait()
            if sentences.qsize() % 500 == 0:
                print(multiprocessing.current_process(),
                      "Instances to process", sentences.qsize())

            sentence = Sentence(s, self.config.e1_type,
                                self.config.e2_type,
                                self.config.max_tokens_away,
                                self.config.min_tokens_away,
                                self.config.context_window_size,
                                tagger,
                                self.config)

            for rel in sentence.relationships:
                t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                          rel.between, rel.after, self.config)
                instances.append(t)

        except queue.Empty:
            print(multiprocessing.current_process(), "Queue is Empty")
            pid = multiprocessing.current_process().pid
            child_conn.send((pid, instances))
            break
def meaning_words(self, text):
    # "meaning" tags: nouns and adjectives only
    meaning_tags = ['NN', 'NNP', 'NNPS', 'JJ']
    default_tagger = data.load(tag._POS_TAGGER)
    '''
    Sometimes the nltk tagger misclassifies a part of speech, e.g. tagging
    "The" as something other than a determiner. The duty tagger also helps
    to eliminate common words that are not so important.
    '''
    duty = dict()
    for w in self.common_words:
        duty[w] = 'x'
    enhanced_tagger = tag.UnigramTagger(model=duty, backoff=default_tagger)

    meaning_words = ' '.join([w for w, c in enhanced_tagger.tag(word_tokenize(text))
                              if c in meaning_tags and (len(w) > 2)])

    # if no meaning words are found using this approach, return the whole text
    if not meaning_words:
        return None
    else:
        return meaning_words
def run(train, test, language, answer):
    results = {}

    if language == 'English':
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
    elif language == 'Spanish':
        tagger = ut(cess_esp.tagged_sents())
    elif language == 'Catalan':
        tagger = ut(cess_cat.tagged_sents())

    for lexelt in train:
        train_features, y_train = extract_features(train[lexelt], language, tagger)
        test_features, _ = extract_features(test[lexelt], language, tagger)

        X_train, X_test = vectorize(train_features, test_features)
        X_train_new, X_test_new = feature_selection(X_train, X_test, y_train)
        results[lexelt] = classify(X_train_new, X_test_new, y_train)

    """ B1.c
    for lexelt in train:
        features = getBestWords(train[lexelt], 30)
        train_features = countFeature(features, train[lexelt])
        _, y_train = extract_features(train[lexelt], language)
        test_features = countFeature(features, test[lexelt])
        X_train, X_test = vectorize(train_features, test_features)
        results[lexelt] = classify(X_train, X_test, y_train)
    B1.c """

    A.print_results(results, answer)
def test_austen():
    from nltk.data import load
    from nltk.corpus import gutenberg as g
    stok = load('tokenizers/punkt/english.pickle')
    train = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    test1 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-sense.txt'))]
    test2 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]

    model1 = AdditiveSmoothing(n=2)
    model1.generate_model(train)
    print 'cross entropy additive smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model1, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model1, test2)

    model2 = KnesserNey(n=2)
    model2.generate_model(train)
    print 'cross entropy knesser-ney smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model2, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model2, test2)

    model3 = SimpleGoodTuring(n=2)
    model3.generate_model(train)
    print 'cross entropy simple good-turing smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model3, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model3, test2)

    model4 = KatzSmoothing(n=2)
    model4.generate_model(train)
    print 'cross entropy katz smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model4, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model4, test2)
def digest(self):
    if self.sentences is not None:
        return

    # Digest the problem into sentences
    tokenizer = data.load("tokenizers/punkt/english.pickle")
    self.sentences = tokenizer.tokenize(self.text.strip())

    # Digest each sentence into words and part-of-speech tags
    if self.sentence_tags is None:
        sentence_tags = []
        all_tags = []
        all_words = []
        for s in self.sentences:
            all_words.append(s)
            tags = pos_tag(word_tokenize(s))
            sentence_tags.append(tags)
            for t in tags:
                l = len(t[0])
                if not self.longest_word or self.longest_word < l:
                    self.longest_word = l
                all_tags.append(t[1])
        self.sentence_tags = sentence_tags
        self.all_tags = uniq(all_tags)
        self.all_words = uniq(all_words)
def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None,
                     skip_header=True):
    """
    Parse csv file containing tweets and output data as a list of (text, label) tuples.

    :param filename: the input csv filename.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into sentences.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).

    :return: a list of (text, label) tuples.
    """
    tweets = []
    if not sent_tokenizer:
        sent_tokenizer = load('tokenizers/punkt/english.pickle')

    # If we use Python 3.x we can proceed using the 'rt' flag
    if sys.version_info[0] == 3:
        with codecs.open(filename, 'rt') as csvfile:
            reader = csv.reader(csvfile)
            if skip_header == True:
                next(reader, None)  # skip the header
            i = 0
            for tweet_id, text in reader:
                # text = text[1]
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizer to text
                if word_tokenizer:
                    tweet = [w for sent in sent_tokenizer.tokenize(text)
                             for w in word_tokenizer.tokenize(sent)]
                else:
                    tweet = text
                tweets.append((tweet, label))
    # If we use Python 2.x we need to handle encoding problems
    elif sys.version_info[0] < 3:
        with codecs.open(filename) as csvfile:
            reader = csv.reader(csvfile)
            if skip_header == True:
                next(reader, None)  # skip the header
            i = 0
            for row in reader:
                unicode_row = [x.decode('utf8') for x in row]
                text = unicode_row[1]
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizer to text
                if word_tokenizer:
                    tweet = [w.encode('utf8') for sent in sent_tokenizer.tokenize(text)
                             for w in word_tokenizer.tokenize(sent)]
                else:
                    tweet = text
                tweets.append((tweet, label))

    print("Loaded {0} tweets".format(i))
    return tweets
def sent_tokenize(text):
    """
    Return a sentence-tokenized copy of *text*, using NLTK's recommended
    sentence tokenizer (currently :class:`.PunktSentenceTokenizer`).
    """
    tokenizer = load("tokenizers/punkt/english.pickle")
    return tokenizer.tokenize(text)
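# Illustrative usage of sent_tokenize above (a sketch, not part of the original
# sources); assumes the Punkt English model is installed via nltk.download('punkt').
def _demo_sent_tokenize():
    text = "NLTK ships a Punkt model. It splits raw text into sentences."
    # Expected: ['NLTK ships a Punkt model.', 'It splits raw text into sentences.']
    return sent_tokenize(text)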
def load(self, loc):
    '''
    :param loc: Load a pickled model at location.
    :type loc: str
    '''
    self.model.weights, self.tagdict, self.classes = load(loc)
    self.model.classes = self.classes
def __init__(self, encoding):
    """Constructor."""
    super(FrenchBonsaiTokenizer, self).__init__()
    self._sentence_tokenizer = data.load('tokenizers/punkt/french.pickle')
    self._encoding = encoding
def __init__(self):
    """Load NLTK's default English maxent treebank POS tagger."""
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    self._tagger = load(_POS_TAGGER)
def solve_problem(problem):
    tokenizer = load("tokenizers/punkt/english.pickle")
    sentences = tokenizer.tokenize(problem.strip())
    print "Problem input: {0}".format(problem)
    for s in get_statements(sentences):
        print "Statement: {0}".format(str(s))
        print "Solution: {0}".format(s.solve())
def load_parser(grammar_url, trace=0, parser=None, chart_class=None,
                beam_size=0, **load_args):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``CFG``)
      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        and higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.  If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart; should be
        ``Chart`` or a subclass.  Only used for CFGs and feature CFGs.
        If None, the chart class depends on the grammar format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, or a subclass thereof.")
    if isinstance(grammar, PCFG):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)

    elif isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
        return parser(grammar, trace=trace, chart_class=chart_class)

    else:  # Plain CFG.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
        return parser(grammar, trace=trace, chart_class=chart_class)
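# Illustrative usage of load_parser above (a sketch, not part of the original
# sources). The grammar path assumes the NLTK "book_grammars" data package is
# installed; any CFG/PCFG/FCFG reachable through data.load works the same way.
def _demo_load_parser():
    cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=0)
    tokens = 'Kim likes children'.split()
    # parse() returns an iterator over parse trees for the token sequence.
    return list(cp.parse(tokens))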
def split_sentences(corpus='rbc.txt', newfile='rbc_se.txt'):
    t = load('tokenizers/punkt/russian.pickle')
    text = open('.\\crawler\\' + corpus, 'r', encoding='utf-8')
    new = open(newfile, 'w', encoding='utf-8')
    for line in text:
        s = t.tokenize(line.strip('\n'))
        for sent in s:
            new.write(sent + '\n')
    text.close()
    new.close()
def treebank_tokenizer(sentence):
    tokenizer = load('data/german.pickle')
    treebank_word_tokenize = TreebankWordTokenizer().tokenize
    tokens = []
    for s in tokenizer.tokenize(sentence):
        tokens.extend([token for token in treebank_word_tokenize(s)])
    tokens = [''.join(i for i in s if i not in string.punctuation) for s in tokens]
    tokens = list(filter(None, tokens))
    return tokens
def ne_chunk_sents(tagged_sentences, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged sentences, each consisting of a list of tagged tokens.
    """
    if binary:
        chunker_pickle = _BINARY_NE_CHUNKER
    else:
        chunker_pickle = _MULTICLASS_NE_CHUNKER
    chunker = load(chunker_pickle)
    return chunker.parse_sents(tagged_sentences)
def ne_chunk(tagged_tokens, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged tokens.
    """
    if binary:
        chunker_pickle = _BINARY_NE_CHUNKER
    else:
        chunker_pickle = _MULTICLASS_NE_CHUNKER
    chunker = load(chunker_pickle)
    return chunker.parse(tagged_tokens)
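# Illustrative usage of ne_chunk above (a sketch, not part of the original sources).
# _BINARY_NE_CHUNKER / _MULTICLASS_NE_CHUNKER are assumed to be module-level paths
# to NLTK's named entity chunker pickles, as the functions above imply.
def _demo_ne_chunk():
    from nltk import pos_tag, word_tokenize
    tagged = pos_tag(word_tokenize("Mark works at a bank in London."))
    # Returns a Tree whose NE subtrees mark the recognized entities.
    return ne_chunk(tagged)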
def update_attributes(self, settingfile_input):
    searchURL = self.http + "/search"
    feature_service = "Feature Service"
    query_dict = {'f': 'json',
                  'token': self.token,
                  'q': "tags:\"" + self.utag + "\" AND owner:\"" + self.username +
                       "\" AND type:\"" + feature_service + "\""}

    jsonResponse = sendAGOLReq(searchURL, query_dict)

    if jsonResponse['total'] == 0:
        # feature_id = jsonResponse['results'][0]['id']
        DirMGMT().lgr.error("\n.Couldn't find the service.\n")
        sys.exit()
    else:
        # jsonResponse = sendAGOLReq(searchURL, query_dict)
        feature_id = jsonResponse['results'][0]['id']

    # Update
    updateURL = agol.http + '/content/users/{}/items/{}/update'.format(agol.username, feature_id)

    sentence_break = data.load('tokenizers/punkt/english.pickle')
    temp_desc = ReadSF(settingfile_input).description
    utagloc = temp_desc.find('uTag')
    cut = temp_desc[utagloc:utagloc + 42]
    temp_desc = temp_desc.replace(cut, '')

    # TODO remove tags from
    temp_tags = ReadSF(settingfile_input).tags
    # utag = temp_tags.split()[-1]
    # lutag = temp_tags.rfind(utag)-2
    # temp_tags = temp_tags[0:lutag]

    url = updateURL + "?f=json&token=" + agol.token + \
          "&type=Feature Service" \
          "&title=" + agol.serviceName.replace('_', ' ') + \
          "&tags=" + temp_tags + \
          "&snippet=" + sentence_break.tokenize(ReadSF(settingfile_input).description.strip())[0] + \
          "&description=" + temp_desc
    # "&description=" + ReadSF(settingfile_input).description.replace("\n\nuTag: "+ReadSF(settingfile_input).tags[-1], '')

    response = requests.post(url)
    itemPartJSON = json.loads(response.text)

    if "success" in itemPartJSON:
        # itemPartID = itemPartJSON['id']
        itemPartTitle = itemPartJSON['id']
        DirMGMT().lgr.info("updated Feature Layer: {}".format(itemPartTitle))
        return True
    else:
        DirMGMT().lgr.error("\n.sd file not uploaded. Check the errors and try again.\n")
        DirMGMT().lgr.error(itemPartJSON)
        sys.exit()
def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*, using NLTK's recommended
    sentence tokenizer (currently :class:`.PunktSentenceTokenizer` for the
    specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = load("tokenizers/punkt/{0}.pickle".format(language))
    return tokenizer.tokenize(text)
def __init__(self, language):
    """
    :param str language: ISO 639-1 language code.
        See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    """
    self.language = language
    model = self.supported_models.get(language)
    if model:
        self.splitter = load(model)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the "
            "currently supported ones: %s" % (language, self.supported_models.keys()))
def read_training_data(training_file):
    """
    Extracts part-of-speech (POS) tag, transition between tags, and emission counts
    from a tagged training corpus.

    The POS tag count keeps track of the number of times a given POS tag occurs in
    the training data. This is stored in a dictionary with POS tag keys and integer
    count values.

    The transition counts keep track of how often the first tag is followed by a
    second tag. This is stored in a dictionary with tuple(tag1, tag2) keys and the
    number of times tag2 follows tag1 as values.

    The emission count keeps track of the number of times a word and its associated
    tag occurs in the data. This is stored in a dictionary with tuple(word, POS tag)
    keys and integer count values.

    The training file is expected to be a training set of POS-tagged sentences,
    separated by newline characters. Additional custom tags, "START" and "END", are
    included to indicate the start and end of each sentence.

    :param training_file: the location of the training file
    :return: a tuple of dictionaries tracking tag counts, transition counts, and emission counts
    """
    tag_types = list(load('help/tagsets/upenn_tagset.pickle').keys()) + [
        "START", "END", "-LRB-", "-RRB-", "#"
    ]
    # The tagset in nltk uses different notations
    tag_types = [x for x in tag_types if x not in ["(", ")", "--"]]
    tag_type_permutations = list(product(tag_types, repeat=2))

    tag_counts = dict.fromkeys(tag_types, 0)
    transition_counts = dict.fromkeys(tag_type_permutations, 0)
    emission_counts = {}

    with open(training_file, "r") as training_data:
        for line in tqdm(training_data, total=rawcount(training_file), desc="Training"):
            tagged_tokens = tuple(str2tuple(tagged_token) for tagged_token in line.split())
            tag_sequence = ("START",) + tuple(
                tagged_token[1] for tagged_token in tagged_tokens) + ("END",)
            for tag in tag_sequence:
                tag_counts[tag] += 1
            for tag_pair in pairwise(tag_sequence):
                transition_counts[tag_pair] += 1
            for tagged_token in tagged_tokens:
                if tagged_token in emission_counts:
                    emission_counts[tagged_token] += 1
                else:
                    emission_counts[tagged_token] = 1

    return tag_counts, transition_counts, emission_counts
def sent_tokenize(text, language='english'):
    """
    Return a sentence-tokenized copy of *text*, using NLTK's recommended
    sentence tokenizer (currently :class:`.PunktSentenceTokenizer` for the
    specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
    return tokenizer.tokenize(text)
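# Illustrative usage of the language-aware sent_tokenize above (a sketch, not part
# of the original sources); assumes the corresponding Punkt models are installed.
def _demo_sent_tokenize_languages():
    english = sent_tokenize("One sentence. Another sentence.")
    # Any model shipped with the Punkt data works, e.g. 'german', 'french', 'russian'.
    german = sent_tokenize("Ein Satz. Noch ein Satz.", language='german')
    return english, german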
def __init__(self, language):
    """
    :param str language: ISO 639-1 language code.
        See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    """
    self.language = language
    model = self.supported_models.get(language)
    if model:
        self.splitter = load(model)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the "
            "currently supported ones: %s" % (language, self.supported_models.keys()))
def _load_universal_map(fileid):
    contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")
    for line in contents.splitlines():
        line = line.strip()
        if line == '':
            continue
        fine, coarse = line.split('\t')

        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
        assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine)

        _MAPPINGS[fileid]['universal'][fine] = coarse
def __init__(self):
    self.GRAPHENE_SERVICE = "http://nietzsche.fim.uni-passau.de:8080/simplification/text"
    self.premiseIndicators = self.read_key_words("resources/premise_indicator.txt")
    self.claimIndicators = self.read_key_words("resources/claim_indicator.txt")
    self.tagdict = load('help/tagsets/upenn_tagset.pickle')
    self.lb = preprocessing.LabelBinarizer()
    self.lb.fit(list(self.tagdict.keys()))
    self.nlp = spacy.load('en')
    self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(
        'resources/GoogleNews-vectors-negative300.bin.gz', binary=True)
def construct_graph(document):
    sentence_detector = data.load('tokenizers/punkt/english.pickle')
    sentences = sentence_detector.tokenize(document)
    nodes = [Node(sentence) for sentence in sentences]
    for idx1 in range(len(nodes)):
        print idx1, len(nodes)
        for idx2 in range(idx1 + 1, len(nodes)):
            node1, node2 = nodes[idx1], nodes[idx2]
            edge_weight = cosine_distance(node1.value, node2.value)
            node1.connect(node2, edge_weight)
    return nodes
def pos_freqs(texts):
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    keys = tagdict.keys()
    key_list = list(keys)
    freqs_array = np.zeros((len(texts), len(key_list)), dtype=np.int)
    for i, text in enumerate(texts):
        tags = pos_tagger(text)
        for j, key in enumerate(key_list):
            freqs_array[i, j] = len([tag for tag in tags if tag == key])
    return freqs_array
def _format_tagset(tagset, tagpattern=None):
    tagdict = load("help/tagsets/" + tagset + ".pickle")
    if not tagpattern:
        _print_entries(sorted(tagdict), tagdict)
    elif tagpattern in tagdict:
        _print_entries([tagpattern], tagdict)
    else:
        tagpattern = re.compile(tagpattern)
        tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)]
        if tags:
            _print_entries(tags, tagdict)
        else:
            print("No matching tags found.")
def _load_universal_map(fileid):
    contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")
    for line in contents.splitlines():
        line = line.strip()
        if line == "":
            continue
        fine, coarse = line.split("\t")

        assert coarse in _UNIVERSAL_TAGS, "Unexpected coarse tag: {}".format(coarse)
        assert fine not in _MAPPINGS[fileid]["universal"], "Multiple entries for original tag: {}".format(fine)

        _MAPPINGS[fileid]["universal"][fine] = coarse
def srcparse(src):
    tokenizer = load("tokenizers/punkt/english.pickle")
    sentences = tokenizer.tokenize(src.strip().lower())

    bs = compile(r'\d*:\d*')
    rm = compile(r'[*.?!,\'":;\(\)<>]')
    sp = compile(r'[\-\+]')

    starts, joins, ends = [], {}, []

    for sentence in sentences:
        # Format the sentence
        wlist = word_tokenize(
            bs.sub(" ", sp.sub(" ", rm.sub("", sentence.replace("\n", " ")))))

        if len(wlist) < 3:
            # Ignore sentences without triples in the corpus
            continue

        # Add the sentence starting word
        starts.append(wlist[0])

        # Inverse the list so we can build from the ending
        wlist.reverse()

        for i in range(len(wlist) - 2):
            w1 = wlist[i]
            w2 = wlist[i + 1]
            w3 = wlist[i + 2]

            # Handle zero-length breaks in the corpus.
            if 0 in [len(w1), len(w2), len(w3)]:
                continue

            # Generate a list of words which can start a sentence properly
            if i == 0:
                ends.append(w1)

            # Store doubles
            try:
                joins[w1].append(w2)
            except KeyError:
                joins[w1] = [w2]

            # Store triples
            key = (w1, w2)
            try:
                joins[key].append(w3)
            except KeyError:
                joins[key] = [w3]

    return starts, joins, ends
def main():
    parser = argparse.ArgumentParser(description="yields uni- and bigrams for pmi models.")
    parser.add_argument('-s', '--stopwords', type=str, help='filter stopwords')
    parser.add_argument('-d', '--digits', action="store_true", default=False,
                        help="remove digits")
    parser.add_argument('-p', '--punctuation', action="store_true", default=False,
                        help="remove punctuation")
    parser.add_argument('-l', '--length', type=int, default=None,
                        help="minimum word length")
    parser.add_argument('-t', '--tags', nargs='+', default=set(), type=str,
                        help="specify forbidden pos tags")
    parser.add_argument('-S', '--sentence-tok', action="store_true", default=False,
                        help="split document into sentences. Co-occurrence boundary is then within sentences")
    parser.add_argument('--lowercase', action="store_true", default=False,
                        help="lowercase input")
    parser.add_argument('--tokenize', action="store_true", default=False,
                        help="tokenize input. necessary for pos-tagging.")
    parser.add_argument('-w', '--window-size', type=int,
                        help="set co-occurence boundary to a window of x terms")
    parser.add_argument('--stemming', action="store_true", default=False,
                        help="perform stemming with the porter2 stemming algorithm")
    parser.add_argument('-u', '--unicode', action="store_true", default=False,
                        help="use Unicode input/output and filter on unicode categories")
    parser.add_argument('--unidecode', action="store_true", default=False,
                        help="convert unicode symbols to ASCII symbols if possible (using Unidecode package)")
    args = parser.parse_args()
    sys.stderr.write(str(args) + "\n")  # arg namespace

    fd = sys.stdin
    out = sys.stdout
    if args.unidecode or args.unicode:
        fd = codecs.getreader('utf-8')(sys.stdin)
        out = codecs.getwriter('utf-8')(sys.stdout)

    global pos_tagger
    global sen_tagger
    pos_tagger, sen_tagger = None, None

    excluded = set()
    if args.stopwords:
        excluded |= load_stopwords(args.stopwords)
    if args.punctuation:
        excluded |= punct
    if args.tags:
        pos_tagger = data.load('file:postagger', format="pickle")
    if args.sentence_tok:
        sen_tagger = data.load('file:sentencetokenizer', format="pickle")

    for w in yieldWords(fd, args, excluded):
        out.write(w + "\n")
def __init__(self, word_embeddings, seq_length=1000, stopwords='default'):
    """
    Initialises the embedder class. Expects a WordEmbeddings object.
    """
    self.embeddings = word_embeddings
    self.MAX_SEQUENCE_LENGTHS = seq_length
    if stopwords == 'default':
        self.STOPWORDS = STOPWORDS
    else:
        self.STOPWORDS = stopwords
    self.postags = load('help/tagsets/upenn_tagset.pickle')
def get_tags_bow(sentences):
    if os.path.isfile(TAGS_BOW):
        return data.get_pickle(TAGS_BOW)
    else:
        from collections import Counter
        from nltk.data import load
        corpus = list(load('help/tagsets/upenn_tagset.pickle').keys())
        f = lambda x: Counter([y for y in x if y in corpus])
        df = pd.DataFrame({"tags": sentences})
        df["bow"] = (pd.DataFrame(df["tags"].apply(f).values.tolist())
                     .reindex(columns=corpus).fillna(0).astype(int).values.tolist())
        result = df["bow"].tolist()
        data.save_pickle(TAGS_BOW, result)
        return result
def generate_vocab_pos_upenn():
    # Getting tags from upenn_tagset
    nltk.download('tagsets', quiet=True)
    tagdict = load('help/tagsets/upenn_tagset.pickle')

    # creating dictionary with pos_tags, using negative numbers
    pos_dic = dict(enumerate(list(set(tagdict.keys()))))
    pos_dictionary = {v: -(k + 1) for k, v in pos_dic.items()}

    # with open(pos_vocabulary_pkl, 'wb') as output:
    #     pickle.dump(pos_dictionary, output, pickle.HIGHEST_PROTOCOL)
    #     print("Pos vocabulary saved as pkl")

    return pos_dictionary
def getSentencesFromFiles(dataDir):
    '''Read the text files in a directory and return a list of all sentences.
    Not practical for very large data sets.
    '''
    sentDetector = ntd.load('tokenizers/punkt/english.pickle')
    allSentences = []
    for folderName, subfolders, filenames in os.walk(dataDir):
        for file in filenames:
            print("Extracting Sentences from file ", file)
            text = open(dataDir + '\\' + file, encoding='utf-8')
            sentences = sentDetector.tokenize(text.read())
            text.close()
            allSentences.extend(sentences)
    return allSentences
def prediction_neighbor_with_pos(lines_with_unknown, word2vec_model, NN_model):
    """
    Return the prediction for each unknown word based on the similarities with the
    nearest neighbours. This time, we use the POS model to select the word based
    on its POS tag.

    :param lines_with_unknown: list of string lines with token words.
    :param word2vec_model: the word2vec model that we used to get similar words.
    :param NN_model: we use this one to predict the pos tag of the word.
    :return: predicted words.
    """
    predicted_words = []  # List of all the predicted words
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    list_tags = list(tagdict.keys())  # Get the list of all the tags.

    for i in range(len(lines_with_unknown)):
        line = lines_with_unknown[i]
        if 'unk' in line:  # If the line contains the word 'unk'
            index = line.index('unk')
            # Extract the words around the unknown token
            neighbours_words = [line[k] for k in (index - 2, index - 1, index + 1, index + 2)]
            most_similar_list = word2vec_model.most_similar(positive=neighbours_words)[:10]

            # Format the neighbouring words for the Neural Network
            sample = []
            for word in neighbours_words:
                sample.append(one_hot_encoding(word, list_tags).tolist())

            # Predict the vector of POS tags and take the id of the most probable one
            Y_pos = NN_model.predict(np.array(sample).reshape((1, 4, 45)))
            id_pos = np.argmax(Y_pos)
            # We now have the predicted POS tag, which lets us make a more accurate prediction
            pos_tag = list_tags[id_pos]

            # We then check if there is a word with the corresponding POS tag among the top 10
            best_candidate = []
            for j in range(len(most_similar_list)):
                word = most_similar_list[j][0]
                if nltk.pos_tag([word])[0][1] == pos_tag:
                    best_candidate.append(word)

            if best_candidate:  # If the list is not empty
                predicted_words.append(best_candidate[0])  # Take the first element
            else:
                # Otherwise we just take the most similar word overall
                predicted_words.append(most_similar_list[0][0])

    return predicted_words
def __init__(self):
    current_dir = os.path.dirname(inspect.stack()[0][1])
    parent_dir = current_dir.rsplit('/', 1)[0]
    self.GRAPHENE_SERVICE = "http://nietzsche.fim.uni-passau.de:8080/simplification/text"
    self.premiseIndicators = self.read_key_words(parent_dir + "/resources/premise_indicator.txt")
    self.claimIndicators = self.read_key_words(parent_dir + "/resources/claim_indicator.txt")
    self.tagdict = load('help/tagsets/upenn_tagset.pickle')
    self.lb = preprocessing.LabelBinarizer()
    self.lb.fit(list(self.tagdict.keys()))
    self.nlp = spacy.load('en')
    self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(
        parent_dir + '/resources/GoogleNews-vectors-negative300.bin.gz', binary=True)
def _load_universal_map(fileid):
    contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")

    _MAPPINGS[fileid]["universal"].default_factory = lambda: "X"

    for line in contents.splitlines():
        line = line.strip()
        if line == "":
            continue
        fine, coarse = line.split("\t")

        assert coarse in _UNIVERSAL_TAGS, "Unexpected coarse tag: {}".format(coarse)
        assert fine not in _MAPPINGS[fileid]["universal"], "Multiple entries for original tag: {}".format(fine)

        _MAPPINGS[fileid]["universal"][fine] = coarse
def _load_universal_map(fileid):
    contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")

    # When mapping to the Universal Tagset,
    # map unknown inputs to 'X' not 'UNK'
    _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'

    for line in contents.splitlines():
        line = line.strip()
        if line == '':
            continue
        fine, coarse = line.split('\t')

        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
        assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine)

        _MAPPINGS[fileid]['universal'][fine] = coarse
def generate_tuples(self, sentences_file):
    """
    Generate tuples instances from a text file with sentences
    where named entities are already tagged

    :param sentences_file:
    """
    if os.path.exists("processed_tuples.pkl"):
        with open("processed_tuples.pkl", "rb") as f_in:
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f_in)
        print(len(self.processed_tuples), "tuples loaded")

    else:
        # load needed stuff, word2vec model and a pos-tagger
        self.config.read_word2vec()
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

        print("\nGenerating relationship instances from sentences")
        with open(sentences_file, encoding='utf-8') as f_sentences:
            count = 0
            for line in f_sentences:
                if line.startswith("#"):
                    continue
                count += 1
                if count % 10000 == 0:
                    sys.stdout.write(".")

                sentence = Sentence(line.strip(),
                                    self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size,
                                    tagger,
                                    self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    self.processed_tuples.append(t)

        print("\n", len(self.processed_tuples), "tuples generated")
        print("Writing generated tuples to disk")
        with open("processed_tuples.pkl", "wb") as f_out:
            pickle.dump(self.processed_tuples, f_out)
def get_subjectivity_analyzer(lang):
    try:
        sa_subj_data_file_path = 'nltk_data/sa_subjectivity.pickle'
        sentim_analyzer = load(DEFAULT_PROJECT_PATH + sa_subj_data_file_path)
    except LookupError:
        my_print('{}Cannot find the sentiment analyzer you want to load.'.format(WARNING_FLAG))
        my_print('{}Training & saving a new one using NaiveBayesClassifier.'.format(WARNING_FLAG))
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)
    return sentim_analyzer
def POS_tagging(essay):
    """
    Part-of-speech tag the essay tokens and return the total count of each tag.
    """
    POS_dict = {}
    for i, j in nltk.pos_tag(essay):
        if j in POS_dict:
            POS_dict[j] += 1
        else:
            POS_dict[j] = 1

    # Make sure every tag in the Penn Treebank tagset has an entry, even if unused
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    for i in tagdict:
        if i not in POS_dict:
            POS_dict[i] = 0
    return POS_dict
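# Illustrative usage of POS_tagging above (a sketch, not part of the original
# sources). The essay argument is expected to be a token list, since it is passed
# straight to nltk.pos_tag.
def _demo_POS_tagging():
    import nltk
    tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")
    counts = POS_tagging(tokens)
    # Tags unused in the essay are present with a count of 0.
    return counts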
def __init__(self, filename):
    self.filename = filename
    self.tokenizer = TreebankWordTokenizer()
    self.sent_tokenizer = load('tokenizers/punkt/{0}.pickle'.format('english'))
    self.st = StanfordPOSTagger(
        '../stanfordPOStagger/english-bidirectional-distsim.tagger',
        '../stanfordPOStagger/stanford-postagger.jar',
        java_options='-mx2048m')
    # self.w2v_model = KeyedVectors.load_word2vec_format(
    #     "C:/Users/PC1/Desktop/python/деплом/deplom/constructions/GoogleNews-vectors-negative300.bin.gz",
    #     binary=True)
    self.w2v_model = None
    self.text = self.get_text()
    self.anns = []
    self.idx_list = IdxList()
    self.punct = punctuation + '‘’— \t\n'
def getDocExcerpt(docId, corpus):
    doc = getDoc(docId, corpus)
    if corpus == Corpus.COURSES:
        text = doc["descr"]
    elif corpus == Corpus.REUTERS:
        text = doc["body"]
    else:
        # Should never hit this
        sys.exit(-1)

    # https://www.nltk.org/api/nltk.tokenize.html
    # Create sentence classifier
    sent_detector = load('tokenizers/punkt/english.pickle')
    excerpt = sent_detector.tokenize(text)[0]
    return excerpt
def __str__(self):
    """Output the problem details in asciidoc"""
    out = []

    # Helpers
    def title(t, tier="=="):
        out.append("{0} {1}".format(tier, t))

    def block(t):
        out.append("****\n{0}\n****\n".format(t))

    # Create output
    title("Problem")
    out.append(self.text)

    if self.interpretation is not None:
        title("Interpretation")
        out.append(str(self.interpretation))

    if self.solution is not None:
        title("Solution")
        out.append(str(self.solution))
        title("Answer")
        block(self.solution.answer)

    if self.debug:
        title("Debugging")

        if self.sentence_tags is not None:
            # Display all the sentence tags
            title("Sentences", "===")
            for tags in self.sentence_tags:
                block(str(tags))

            # Define what each tag means
            title("Tags", "===")
            tagdict = load('help/tagsets/upenn_tagset.pickle')
            for t in self.all_tags:
                if t not in tagdict:
                    d = ("?", "No examples")
                else:
                    d = tagdict[t]
                block("*Tag '{0}'*: {1}\n\n{2}".format(t, d[0], d[1]))

    return "\n".join(out) + "\n"
def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None,
                     skip_header=True):
    """
    Parse csv file containing tweets and output data as a list of (text, label) tuples.

    :param filename: the input csv filename.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into sentences.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).

    :return: a list of (text, label) tuples.
    """
    tweets = []
    if not sent_tokenizer:
        sent_tokenizer = load("tokenizers/punkt/english.pickle")

    with codecs.open(filename, "rt") as csvfile:
        reader = csv.reader(csvfile)
        if skip_header == True:
            next(reader, None)  # skip the header
        i = 0
        for tweet_id, text in reader:
            # text = text[1]
            i += 1
            sys.stdout.write("Loaded {0} tweets\r".format(i))
            # Apply sentence and word tokenizer to text
            if word_tokenizer:
                tweet = [
                    w
                    for sent in sent_tokenizer.tokenize(text)
                    for w in word_tokenizer.tokenize(sent)
                ]
            else:
                tweet = text
            tweets.append((tweet, label))

    print("Loaded {0} tweets".format(i))
    return tweets
def generate_tuples(self, sentences_file):
    """
    Generate tuples instances from a text file with sentences
    where named entities are already tagged
    """
    try:
        os.path.isfile("processed_tuples.pkl")
        f = open("processed_tuples.pkl", "r")
        print "\nLoading processed tuples from disk..."
        self.processed_tuples = cPickle.load(f)
        f.close()
        print len(self.processed_tuples), "tuples loaded"

    except IOError:
        self.config.read_word2vec()
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

        print "\nGenerating relationship instances from sentences"
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        count = 0
        for line in f_sentences:
            if line.startswith("#"):
                continue
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

            sentence = Sentence(line.strip(),
                                self.config.e1_type,
                                self.config.e2_type,
                                self.config.max_tokens_away,
                                self.config.min_tokens_away,
                                self.config.context_window_size,
                                tagger,
                                self.config)

            for rel in sentence.relationships:
                t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                          rel.between, rel.after, self.config)
                self.processed_tuples.append(t)
        f_sentences.close()

        print "\n", len(self.processed_tuples), "tuples generated"
        print "Writing generated tuples to disk"
        f = open("processed_tuples.pkl", "wb")
        cPickle.dump(self.processed_tuples, f)
        f.close()
def generate_tweet(bigram, trigram):
    tweet = ''
    word1 = ''
    word2 = START_TOKEN

    # Keep adding words until we reach an end token
    while word2 != END_TOKEN:
        # First try to use the trigram
        choices = trigram[word1][word2]
        # Fall back on the bigram if necessary
        if len(choices.items()) == 0:
            choices = bigram[word2]

        # Choose a new word based on the weighted values of the choices
        flat_choices = []
        for key, value in choices.items():
            flat_choices += [key] * value
        word3 = choice(flat_choices)
        tweet += word3 + ' '

        # Advance generator words
        word1 = word2
        word2 = word3

    # Reformat tweet
    tweet = tweet[:-(len(END_TOKEN) + 2)]  # Remove end token
    tweet = re.sub(r' !', '!', tweet)      # Join exclamation marks
    tweet = re.sub(r' \?', '?', tweet)     # Join question marks
    tweet = re.sub(r' \.', '.', tweet)     # Join periods

    # Capitalize sentences
    sentence_tokenizer = load('tokenizers/punkt/english.pickle')
    sentences = sentence_tokenizer.tokenize(tweet)
    sentences = [(sentence[0].upper() + sentence[1:]) for sentence in sentences]
    tweet = ' '.join(sentences)

    # Validate tweet
    is_valid_tweet = len(tweet) <= 280
    if is_valid_tweet:
        return tweet
    else:
        return generate_tweet(bigram, trigram)
def get_num_pos_tags(train, test):
    # First find out which POS tags are possible
    vocabulary = list(load('help/tagsets/upenn_tagset.pickle'))
    pos_tags_train = [tweet.pos_tags for tweet in train]
    pos_tags_test = [tweet.pos_tags for tweet in test]

    # CountVectorizer is used to create a vector for each tweet. Each number in this
    # vector represents the number of occurrences for a specific POS tag.
    # All those vectors have the same length, which is needed to use them for the SVM.
    vectorizer = CountVectorizer(vocabulary=vocabulary,
                                 tokenizer=lambda doc: doc,
                                 lowercase=False)
    train_vector = vectorizer.transform(pos_tags_train)
    test_vector = vectorizer.transform(pos_tags_test)
    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())
def extract_pos_tag(string):
    nltk.download('tagsets')
    nltk.download('averaged_perceptron_tagger')
    tagdict = load('help/tagsets/upenn_tagset.pickle')

    keyList = []
    for key in tagdict.keys():
        keyList.append(key)
    skeleton_dict = {key: 0 for key in keyList}

    ts = nltk.word_tokenize(string)
    td = nltk.pos_tag(ts)

    sdc = skeleton_dict.copy()
    for i in range(len(td)):
        sdc[td[i][1]] = sdc[td[i][1]] + 1

    # return list(sdc.items())
    return [v for k, v in sdc.items()]