class TokenizingEmbeddingVectorizer:
    """
    This vectorizer first tokenizes its input.
    """

    def __init__(self, embedding_file, ignored_tokens=set()):
        self.vectorizer = EmbeddingVectorizer(embedding_file, ignored_tokens)
        self.tokenizer = TreebankWordTokenizer()
        self.embeddings = self.vectorizer.embeddings
        self.token2Index = self.vectorizer.token2Index

    def tokenize_sentences(self, sentences):
        tokenized_sentences = list(map(lambda sentence: " ".join(self.tokenizer.tokenize(sentence)), sentences))
        return tokenized_sentences

    def prepare_data(self, sentences, labels):
        prepared_sentences = list(map(lambda sentence: " ".join(self.tokenizer.tokenize(sentence)), sentences))
        return self.vectorizer.prepare_data(prepared_sentences, labels)

    def sentences_to_padded_indices(self, sentences, max_length, padding="pre"):
        return self.vectorizer.sentences_to_padded_indices(sentences, max_length, padding)

    def sentences_to_indices(self, sentences):
        return np.array(self.vectorizer.sentences_to_indices(sentences))
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.

    Uses NLTK's TreebankWordTokenizer. Note that we first split the text into
    sentences using NLTK's sent_tokenize. We additionally call a filtering
    function to remove unwanted tokens.

    IN:
    - text, str
    OUT:
    - list of strings
    '''
    ## list of tokens
    list_tokens = []
    ## split text into sentences
    sentences = sent_tokenize(text, language=language)
    ## define the tokenizer
    tokenizer = TreebankWordTokenizer()
    ## loop over all sentences
    for sent in sentences:
        ## tokenize the sentence
        sent_tokenized = tokenizer.tokenize(sent)
        ## add tokens to list of tokens
        list_tokens += sent_tokenized
    list_tokens = filter_tokens(list_tokens)
    return list_tokens
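For reference, a minimal usage sketch of the helper above; the `filter_tokens` stand-in below is hypothetical (the real filtering function lives elsewhere in the module), and the NLTK imports are the ones the snippet already assumes.

from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

# Hypothetical stand-in for the module's filter_tokens(): here it simply drops
# tokens that contain no alphanumeric characters (pure punctuation).
def filter_tokens(tokens):
    return [t for t in tokens if any(c.isalnum() for c in t)]

print(tokenize_text("Dr. Smith arrived at 5 p.m. He was late."))
# Prints a single flat list of word tokens drawn from both sentences.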
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
    from nltk.tokenize.treebank import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()

    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from string import ascii_lowercase

    docs = {}
    print("Found %i stories" % stories.count())
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        # text = story.story_title
        text = ''.join(
            BeautifulSoup(text, features="lxml").findAll(text=True)).lower()

        if delimiter:
            sections = text.split(delimiter)
        else:
            sections = [text]

        if doc_limit > 0 and len(docs) > doc_limit:
            print("Passed doc limit %i" % len(docs))
            break
        print(story.story_title, len(sections))

        for jj in xrange(len(sections)):
            docs["%s-%i" % (story.story_title, jj)] = \
                [x for x in tokenizer.tokenize(sections[jj])
                 if (not x in stop) and
                 (min(y in ascii_lowercase for y in x))]
    return docs
def get_tf_idf_score(self, sentence, mode, ngram=1):
    if ngram not in range(1, 4):
        raise ValueError("Only unigrams, bigrams and trigrams are supported.")
    if mode != "lex" and mode != "pos":
        raise ValueError("Only lexical and POS distinctness supported.")
    if len(self.document_freqs_lex.keys()) == 0 or len(self.document_freqs_pos.keys()) == 0:
        raise AttributeError("Document frequency dictionaries not initialized. "
                             "Call load_doc_freqs() on the LM object.")

    tokenizer = TreebankWordTokenizer()
    sentence = sentence.lower()
    tokens = tokenizer.tokenize(sentence)
    tokens = self.__fix_tokens(tokens)
    tags = nltk.pos_tag(tokens)
    tags = self.__add_start_end_tags(tags)

    if mode == "lex":
        score = self.__get_lex_tf_idf(tags, ngram)
    else:
        score = self.__get_pos_tf_idf(tags, ngram)
    return score
def __init__(self):
    '''
    Constructor
    '''
    self.__tokenizer = TreebankWordTokenizer()
    self.__r_end_sentence = re.compile(r"\.|\?|!")
def tokenize(documents):
    real_tokens = []
    documents2 = []
    tbw = TreebankWordTokenizer()
    for doc in documents:
        text = doc["text"]
        file = doc["id"]
        text = text.replace("\"", "'")
        #text = text.replace("/", " ")
        text = text.replace("-", " ")
        text = text.replace(".", " ")
        tokens = tbw.span_tokenize(text)
        for token in tokens:
            token_txt = text[token[0]:token[1]]
            found = False
            for tag in doc["tags"]:
                if int(tag["start"]) <= token[0] and int(tag["end"]) >= token[1]:
                    token_tag = tag["tag"]
                    token_tag_type = tag["type"]
                    found = True
            if found == False:
                token_tag = "O"
                token_tag_type = "O"
            real_tokens.append({"token": token_txt, "start": token[0], "end": token[1],
                                "tag": token_tag, "tag_type": token_tag_type})
        documents2.append({"id": file, "text": text, "tags": doc["tags"], "tokens": real_tokens})
    return documents2
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
    from nltk.tokenize.treebank import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()

    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from string import ascii_lowercase

    docs = {}
    print("Found %i stories" % stories.count())
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        # text = story.story_title
        text = ''.join(BeautifulSoup(text).findAll(text=True)).lower()

        if delimiter:
            sections = text.split(delimiter)
        else:
            sections = [text]

        if doc_limit > 0 and len(docs) > doc_limit:
            print("Passed doc limit %i" % len(docs))
            break
        print(story.story_title, len(sections))

        for jj in xrange(len(sections)):
            docs["%s-%i" % (story.story_title, jj)] = \
                [x for x in tokenizer.tokenize(sections[jj])
                 if (not x in stop) and
                 (min(y in ascii_lowercase for y in x))]
    return docs
def tokenize():
    text = request.json["text"]
    try:
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    except LookupError:
        nltk.download('punkt')
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    return {"tokens": [(s[0], s[1], text[s[0]:s[1]]) for s in spans]}
def tokenize(self, text: str):
    """
    :rtype: list
    :param text: text to be tokenized into sentences
    :type text: str
    """
    sents = self.sent_tokenizer.tokenize(text)
    tokenizer = TreebankWordTokenizer()
    return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    tokenizer = TreebankWordTokenizer()
    with open(path, encoding='utf8') as f:
        for l in f:
            table = str.maketrans(dict.fromkeys(string.punctuation + '0123456789'))  # to remove numbers & punctuation
            sentences.append(tokenizer.tokenize(l.translate(table).lower()))
    return sentences
def treebank_tokenizer(sentence):
    # split 's, but also protect < and > so they can be restored after tokenization
    t = TreebankWordTokenizer()
    word_lst = t.tokenize(sentence.lower().replace("<", "LAB_").replace(">", "_RAB"))
    ret = []
    for w in word_lst:
        ret.append(w.replace("LAB_", "<").replace("_RAB", ">"))
    return ret
def read(fn, test_percentage, maxlen, max_features, dataset_type):
    """
    :param fn: dataset filename.
    :param maxlen: maximum length for each sentence.
    :param max_features: max_features (e.g., unique words, vocabulary)
    :param padding: If true, padding will be made starting and ending of each sentence.
    :return:
    """
    tokenizer = TreebankWordTokenizer()
    c = count(2)
    word_idx = {}
    try:
        lines = codecs.open(fn, encoding='utf8').read().splitlines()
    except UnicodeDecodeError:
        lines = codecs.open(fn).read().splitlines()

    y = []
    X = []
    for line in lines:
        try:
            label, sentence = line.split('\t')
        except ValueError:
            continue
        y.append(label)
        s = []
        for token in tokenizer.tokenize(sentence):
            idx = word_idx.get(token, None)
            if idx is None:
                idx = c.next()
                if idx < max_features:
                    word_idx[token] = idx
                else:
                    idx = 1
            s.append(idx)
        X.append(s)

    X = sequence.pad_sequences(X, maxlen=maxlen)
    num_instance_for_train = int(len(X) * (1 - test_percentage))

    # convert labels into floats if the labels are real-valued.
    if dataset_type == 'regression':
        y = map(lambda e: float(e), y)
    else:
        set_y = set(y)
        print >> sys.stderr, set_y
        label1, label2 = set_y  # now supporting only binary classification.
        labels = {label1: 0, label2: 1}
        y = map(lambda e: labels[e], y)  # map labels 0/1.
    y = np.array(y)

    print "training set size {}, test set size {}".format(
        num_instance_for_train, max(len(X) - num_instance_for_train, 0))

    return (X[:num_instance_for_train, :], y[:num_instance_for_train]), \
           (X[num_instance_for_train:, :], y[num_instance_for_train:]), word_idx
def english_tokenization(term):
    word_tokenizer = TreebankWordTokenizer()
    tokenized_term = ""
    for word in word_tokenizer.tokenize(term):
        if tokenized_term != "":
            tokenized_term += " "
        tokenized_term += word
    return tokenized_term
async def tokenize(request: Request):
    body = await request.json()
    text = body["text"]
    print(text)
    try:
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    except LookupError:
        nltk.download('punkt')
        spans = list(TreebankWordTokenizer().span_tokenize(text))
    return {"tokens": [(s[0], s[1], text[s[0]:s[1]]) for s in spans]}
def transform_texts(art, period, site, ngrams=1, mod=None, text_column='text', text_token_column='text_token', remain_columns=('author', 'site', 'link')): """Transform dataframe with texts, create tokenized lists in columns. Save dataframe to mod directory, if mod is not None.""" text_column_paragraphs = text_column + '_paragraphs' text_token_column_lower = text_token_column + '_lower' text_token_column_stemmed = text_token_column + '_stemmed' text_token_column_count = text_token_column + '_count' st = SnowballStemmer('english') art.dropna(subset=[text_column], inplace=True) # maketrans fails if there are nans art_sh = art[list((text_column, ) + remain_columns)].copy() # we don't need more columns del art gc.collect() additional_punctuation = string.punctuation + '«»…—’‘“”–•' # a few additional, non-ascii chars # gigaom tt = TreebankWordTokenizer() art_sh[text_column] = art_sh[text_column].apply( lambda x: x.replace('Tweet\nShare\nPost\n', '').replace( '“', '').replace('”', '').replace('’', '\'')) # sent_tokenize tokenizes by paragraphs art_sh[text_column_paragraphs] = art_sh[text_column].apply( lambda x: x.split('\n\n')) art_sh[text_token_column] = art_sh[text_column_paragraphs].apply( lambda x: [flatten([tt.tokenize(z) for z in sent_tokenize(y)]) for y in x]) # to lower, stem art_sh[text_token_column_lower] = art_sh[text_token_column].apply( lambda x: [[word.lower() for word in paragraph] for paragraph in x]) art_sh[text_token_column_stemmed] = art_sh[text_token_column_lower].apply( lambda x: [[st.stem(word) for word in paragraph] for paragraph in x]) if ngrams == 2: # convert to bigrams art_sh[text_token_column] = art_sh[text_token_column_lower].apply( to_bigram) art_sh[text_token_column_lower] = art_sh[ text_token_column_lower].apply(to_bigram) art_sh[text_token_column_stemmed] = art_sh[ text_token_column_stemmed].apply(to_bigram) art_sh[text_token_column_count] = art_sh[text_token_column_stemmed].apply( lambda x: dict(Counter(FreqDist(flatten(x))))) if mod is not None: art_sh.to_csv(mod + 'dfs_articles' + period + site + '.csv') return art_sh
def __init__(self, tokenizer_method: str = "TreebankWordTokenizer"):
    self.token2idx = {}
    self.tokenizer = None
    if tokenizer_method == "TreebankWordTokenizer":
        self.tokenizer = TreebankWordTokenizer()
    else:
        raise NotImplementedError(
            "tokenizer_method {} doesn't exist".format(tokenizer_method))
    self.add_token(UNK_TOKEN)  # Add UNK token
class DocumentTokenizer(object):
    """
    Used to split a document into sentences and tokens.
    Returns a list of lists.
    TODO
    """

    def __init__(self, sent_tokenizer=None, word_tokenizer=None):
        # Fall back to the defaults when no tokenizers are supplied;
        # otherwise keep the ones passed in.
        if not sent_tokenizer:
            #self.sent_tokenizer = ClinicalRushSentenceTokenizer('rush_rules.tsv')
            self.sent_tokenizer = DefaultSentenceTokenizer()
        else:
            self.sent_tokenizer = sent_tokenizer
        if not word_tokenizer:
            self.word_tokenizer = TreebankWordTokenizer()
        else:
            self.word_tokenizer = word_tokenizer

    def tokenize_doc(self, doc):
        """
        Takes a raw string. Returns a list of lists where each list is a sentence
        and each sentence contains two-tuples of tokens and spans.
        """
        tokenized_sents_and_spans = []
        try:
            # sentence_spans is a list of tuples of spans
            sentence_spans = self.sent_tokenizer.tokenize_sents(doc)
        except Exception as e:
            raise e
        for start, end in sentence_spans:
            sentence = doc[start:end]
            tokenized_sents_and_spans.append(self.tokenize_sent(sentence, start))
        return tokenized_sents_and_spans

    def tokenize_sent(self, sentence, offset):
        try:
            tokens = self.word_tokenizer.tokenize(sentence)
        except Exception as e:
            print("Word tokenizing failed")
            print(sentence)
            raise e
        try:
            spans = self.word_tokenizer.span_tokenize(sentence)
        except Exception as e:
            print("Span tokenizing failed")
            print(sentence)
            raise e
        tokens_and_spans = []
        for token, span in zip(tokens, spans):
            start, end = span
            true_start = start + offset
            true_end = end + offset
            tokens_and_spans.append((token, (true_start, true_end)))
        return tokens_and_spans
def tokenize(sents):
    """Identify the tokens of the input sentences.

    Returns:
        A list of sentences. Each sentence is a list of tokens.
    """
    tokenizer = TreebankWordTokenizer()
    sent_tokens = [tokenizer.tokenize(sent) for sent in sents]
    return sent_tokens
def tokenize(review: str) -> list:
    """Tokenize string based on NLTK TreebankWordTokenizer.

    Args:
        review: The raw review content.

    Returns:
        A list of tokens found by the NLTK tokenizer.
    """
    tokenizer = TreebankWordTokenizer()
    return tokenizer.tokenize(review)
def tokenize(self, text: str):
    """
    :rtype: list
    :param text: text to be tokenized into sentences
    :type text: str
    :param model: tokenizer object to be used  # Should be in init?
    :type model: object
    """
    sents = self.sent_tokenizer.tokenize(text)
    tokenizer = TreebankWordTokenizer()
    return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
def __init__(self, sentence_tokenizer: Any = None, paragraph_threshold: int = 150):
    """
    Constructor
    :param sentence_tokenizer: a sentence tokenizer that provides a tokenize(t: str) -> [str] method
                               (for instance: nltk.data.load('tokenizers/punkt/english.pickle'))
    :param paragraph_threshold: the minimum number of characters a paragraph should contain
                                (it will be filtered otherwise)
    """
    self.sentences_tokenizer = sentence_tokenizer
    self.paragraph_threshold = paragraph_threshold
    self._word_tokenizer = TreebankWordTokenizer()
def __init__(self):
    self.tokenizer = TreebankWordTokenizer()
    self.tokenizer.PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\<\>]|[\{\}]+'), r' \g<0> ')

    # See discussion on https://github.com/nltk/nltk/pull/1437
    # Adding to TreebankWordTokenizer, the splits on
    # - chevron quotes u'\xab' and u'\xbb'
    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
    improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
    improved_close_quote_regex = re.compile(u'([»”’])', re.U)
    improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
    self.tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
    self.tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
    self.tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
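A small, hedged sketch of the effect of the quote handling added above, using only a plain TreebankWordTokenizer (the wrapper class name is not shown in the snippet, so the tweak is applied directly here).

import re
from nltk.tokenize.treebank import TreebankWordTokenizer

tok = TreebankWordTokenizer()
# Mirror the inserted rules so chevron and curly quotes are split off as their own tokens.
tok.STARTING_QUOTES.insert(0, (re.compile(u'([«“‘])', re.U), r' \1 '))
tok.ENDING_QUOTES.insert(0, (re.compile(u'([»”’])', re.U), r' \1 '))

print(tok.tokenize(u'Elle a dit «bonjour» et “goodbye”.'))
# The « » and “ ” marks now come out as separate tokens instead of staying
# attached to the neighbouring words.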
def __init__(self, *args, **kwargs):
    if 'tokenize' in kwargs:
        raise TypeError(
            'TreebankEncoder defines a tokenize callable TreebankWordTokenizer')

    try:
        import nltk

        # Required for moses
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')

        from nltk.tokenize.treebank import TreebankWordTokenizer
        from nltk.tokenize.treebank import TreebankWordDetokenizer
    except ImportError:
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise

    self.detokenizer = TreebankWordDetokenizer()
    super().__init__(*args, **kwargs, tokenize=TreebankWordTokenizer().tokenize)
def word_tokenize(text, language="spanish"):
    """ It splits the text into words

    Args:
        text: text to be split
        language: language of the tokenizer to be used

    Returns:
        List of words
    """
    # try to use from local
    try:
        from nltk.tokenize.treebank import TreebankWordTokenizer
        _treebank_word_tokenize = TreebankWordTokenizer().tokenize
        return [
            token for sent in sent_tokenize(text)
            for token in _treebank_word_tokenize(sent)
        ]
    # if not, use nltk
    except IOError:
        from nltk import word_tokenize
        return word_tokenize(text, language)
class TreebankSpanTokenizer(TreebankWordTokenizer):

    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            yield (ix, end)
            ix = end

    def tokenize(self, text, withSpans=False):
        tokens = self._word_tokenizer.tokenize(text)
        if not withSpans:
            return tokens
        spans = []
        ix = 0
        for word_token in tokens:
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            spans.append((ix, end))
            ix = end
        return zip(tokens, spans)
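A brief usage sketch for the span-tokenizer wrapper above; it assumes the class definition is in scope together with the NLTK import.

from nltk.tokenize.treebank import TreebankWordTokenizer

text = "Span offsets let you map tokens back to the source."
spanner = TreebankSpanTokenizer()
for start, end in spanner.span_tokenize(text):
    # Each yielded span indexes back into the original string.
    print(start, end, repr(text[start:end]))
# Caveat: tokens the Treebank rules rewrite (e.g. double quotes become `` / '')
# are not found verbatim by text.find, so plain prose works best with this wrapper.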
def term_frequency(sentence, ngrams=4):
    """Given a sentence, calculates term frequency of tuples.

    Parameters
    ----------
    sentence : str
        Sentence whose term frequency has to be calculated.
    ngrams : int
        Number of n-grams for which term frequency is calculated.

    Returns
    -------
    dict
        {tuple : int} key-value pairs representing term frequency.
    """
    sentence = sentence.lower().strip()
    for punc in PUNCTUATIONS:
        sentence = sentence.replace(punc, "")
    words = TreebankWordTokenizer().tokenize(sentence)
    counts = {}
    for i in range(ngrams):
        for j in range(len(words) - i):
            ngram = tuple(words[j:(j + i + 1)])
            if ngram in counts:
                counts[ngram] += 1
            else:
                counts[ngram] = 1
    return counts
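A quick, hedged check of the n-gram counter above; `PUNCTUATIONS` is assumed to be a module-level constant along the lines of `string.punctuation`.

import string
from nltk.tokenize.treebank import TreebankWordTokenizer

PUNCTUATIONS = string.punctuation  # assumed definition of the module constant used above

counts = term_frequency("the cat sat on the mat", ngrams=2)
print(counts[("the",)])        # 2 -- the unigram "the" occurs twice
print(counts[("the", "cat")])  # 1 -- each bigram occurs once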
def __init__(self, data, tokenizer):
    self._text = to_unicode(data).strip()
    self._tokenizer = tokenizer
    self._treebank_word_tokenize = TreebankWordTokenizer().tokenize
    self.formdocument()
    self.extractsentences()
    self.extractwords()
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print "Gathering sentences and removing stopwords"
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [
            word for word in TreebankWordTokenizer().tokenize(line.lower())
            if word not in stopwords
        ]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    # print(documents)
    print len(documents), "documents read"
    print len(self.dictionary), " unique tokens", self.dictionary
class Dictionary:
    def __init__(self, tokenizer_method: str = "TreebankWordTokenizer"):
        self.token2idx = {}
        self.tokenizer = None
        if tokenizer_method == "TreebankWordTokenizer":
            self.tokenizer = TreebankWordTokenizer()
        else:
            raise NotImplementedError(
                "tokenizer_method {} doesn't exist".format(tokenizer_method))
        self.add_token(UNK_TOKEN)  # Add UNK token

    def build_dictionary_from_captions(self, captions: List[str]):
        for caption in captions:
            tokens = self.tokenizer.tokenize(caption)
            for token in tokens:
                self.add_token(token)

    def size(self) -> int:
        return len(self.token2idx)

    def add_token(self, token: str):
        if token not in self.token2idx:
            self.token2idx[token] = len(self.token2idx)

    def lookup_token(self, token: str) -> int:
        if token in self.token2idx:
            return self.token2idx[token]
        return self.token2idx[UNK_TOKEN]
def __init__(self, modelID, inputXMLfilepath= "", modelType="", title="", objects=[]): ''' Constructor @param modelID: identifier of the model @param inputXMLfilepath: path to the input XML file containing the model if this parameter is left empty a new XML tree is created @param type: KAOS, TROPOS, or any other kind of model ''' self.textFilter = TextFilter() self.wordTokenizer = TreebankWordTokenizer() self.maxID = "100" #@todo: we have to set the current maximum to the actual maximum value #for the model self.modelInfo = ModelInfo(modelID) if not inputXMLfilepath == "": self.modelInfo.setLocation(inputXMLfilepath) self.tree = ET.parse(self.modelInfo.getLocation()) self.__loadModelInfo(self.modelInfo) self.modelGoals = self.__loadModelGoals() self.modelWords = self.__loadModelWords() self.modelStems = self.__loadModelStems() else: attributes = dict() attributes['type'] = modelType attributes['title'] = title attributes['object'] = objects root = Element("MODEL", attributes) self.tree = ElementTree(root)
def __init__(self): filename = 'Models/CRF_crfsuite_dict.crfsuite' self.crf_model = pycrfsuite.Tagger() self._treebank_word_tokenizer = TreebankWordTokenizer() country_file = open("Dictionaries/Countries.txt",'r', encoding='utf-8') self.dictionary_country = country_file.readlines() self.dictionary_country = set([line[:-1] for line in self.dictionary_country]) city_file = open("Dictionaries/Cities.txt",'r', encoding='utf-8') self.dictionary_city = city_file.readlines() self.dictionary_city = set([line[:-1] for line in self.dictionary_city]) first_name_file = open("Dictionaries/dictionary_first_names.txt", 'r', encoding='utf-8') self.dictionary_first_name = first_name_file.readlines() self.dictionary_first_name = set([line[:-1].lower() for line in self.dictionary_first_name]) surname_file = open("Dictionaries/dictionary_surnames.txt", 'r', encoding='utf-8') self.dictionary_surname = surname_file.readlines() self.dictionary_surname = set([line[:-1].lower() for line in self.dictionary_surname]) if os.path.exists(filename): self.crf_model.open('Models/CRF_crfsuite_dict.crfsuite') else: self.crf_model = None self.dictionary_job_titles = [] with open('Dictionaries/job_title_dictionary.csv', encoding='utf-8') as csv_file: csv_reader = csv.reader(csv_file,delimiter=',') for row in csv_reader: if row[2]=='assignedrole': candidates = row[0].lower().split(' ') for can in candidates: if len(can)>2: self.dictionary_job_titles.append(can) self.dictionary_job_titles = set(self.dictionary_job_titles) pass
def word_tokenize_by_string(note):
    translator = str.maketrans('', '', string.punctuation)
    _treebank_word_tokenizer = TreebankWordTokenizer()
    note = note.translate(translator)
    note = note.replace('0', '#')
    note = note.replace('1', '#')
    note = note.replace('2', '#')
    note = note.replace('3', '#')
    note = note.replace('4', '#')
    note = note.replace('5', '#')
    note = note.replace('6', '#')
    note = note.replace('7', '#')
    note = note.replace('8', '#')
    note = note.replace('9', '#')
    tokenized_note = _treebank_word_tokenizer.tokenize(note)
    return tokenized_note
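A short usage sketch for the digit-masking helper above, assuming the `string` and NLTK imports the snippet already relies on.

import string
from nltk.tokenize.treebank import TreebankWordTokenizer

print(word_tokenize_by_string("Patient is 45 years old, BP 120/80."))
# Punctuation is stripped first and every digit is masked with '#', so the
# result is roughly ['Patient', 'is', '##', 'years', 'old', 'BP', '#####'].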
def generate_syntactically_similar_sentences_replace(num_of_perturb, dataset):
    """Generate syntactically similar sentences for each sentence in the dataset.
    For PaInv-Replace
    Returns dictionary of original sentence to list of generated sentences
    """
    # Use nltk treebank tokenizer and detokenizer
    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()

    # Stopwords from nltk
    stopWords = list(set(stopwords.words('english')))

    # File from which sentences are read
    file = open(dataset, "r")

    # when we use Bert
    berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
    bertmodel.eval()

    # Number of perturbations you want to make for a word in a sentence
    dic = {}
    num_of_perturb = 50
    num_sent = 0
    for line in file:
        s_list = line.split("\n")
        source_sent = s_list[0]
        # Generating new sentences using BERT
        new_sents = perturb(source_sent, bertmodel, num_of_perturb)
        dic[line] = new_sents
        if new_sents != []:
            num_sent += 1
    return dic
def __init__(self, modelIndexManager):
    '''
    @param modelIndex: reference to the place where the models are indexed
    '''
    self.textFilter = TextFilter()
    self.modelIndexManager = modelIndexManager
    self.wordTokenizer = TreebankWordTokenizer()
    self.tRecommender = TransformationRecommender()
def tokenize(self, text: str, split_enclitics: list = ['ne', 'n', 'que', 've', 'ue', 'st'],
             split_words: list = []):
    """
    :rtype: list
    :param text: text to be tokenized into sentences
    :type text: str
    :param model: tokenizer object to be used  # Should be in init?
    :type model: object
    """
    if self._latin_replacements:
        split_words = self._latin_replacements
    if split_words:
        text = self._replace_patterns(text, split_words)
    sents = self.sent_tokenizer.tokenize(text)
    if split_enclitics:
        sents = self._split_enclitics(sents, split_enclitics)
    tokenizer = TreebankWordTokenizer()
    return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
class TransformationRecommender(object):
    '''
    This class recommends a transformation according to the model information
    (ModelInfo object) and the query issued
    '''

    '''
    This object is a Singleton, since it does not have private data but only
    functions: the code below defines a singleton
    '''
    _instance = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(TransformationRecommender, cls).__new__(
                cls, *args, **kwargs)
        return cls._instance

    def __init__(self):
        self.tf = TextFilter()
        self.wordTokenizer = TreebankWordTokenizer()

    def getRecommendedTransformation(self, modelInfo, query):
        '''
        If the input sentence is the same as the title, except for the title part
        that specifies the object, then "object change" shall be suggested
        '''
        title = modelInfo.getName()
        titleFiltered = self.tf.filter_all_except_stem(title)
        titleToks = self.wordTokenizer.tokenize(titleFiltered)
        titleToksNoObj = [t for t in titleToks if t not in modelInfo.getObjects()]

        queryFiltered = self.tf.filter_all_except_stem(query)
        sentenceToks = self.wordTokenizer.tokenize(queryFiltered)

        if set(titleToksNoObj).issubset(sentenceToks):
            return OBJECT_CHANGE
        else:
            return ''
def testset_read(fn, word_idx, maxlen):
    total_num_of_unk = 0
    tokenizer = TreebankWordTokenizer()
    try:
        lines = codecs.open(fn, encoding='utf8').read().splitlines()
    except UnicodeDecodeError:
        lines = codecs.open(fn).read().splitlines()

    X = []
    sentences = []
    for line in lines:
        s = []
        for token in tokenizer.tokenize(line):
            idx = word_idx.get(token, 1)  # 1 is UNKNOWN word id
            if idx == 1:
                total_num_of_unk += 1
            s.append(idx)
        X.append(s)
        sentences.append(line)

    X = sequence.pad_sequences(X, maxlen=maxlen)

    print >> sys.stderr, "Total number of UNK={}, Avg. {}".format(
        total_num_of_unk, total_num_of_unk / float(len(sentences)))

    return X, sentences
class TreebankSpanTokenizer(TreebankWordTokenizer):

    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            yield (ix, end)
            ix = end

    def tokenize(self, text):
        return self._word_tokenizer.tokenize(text)
def __init__(self):
    self.tf = TextFilter()
    self.wordTokenizer = TreebankWordTokenizer()
def __init__(self):
    self._word_tokenizer = TreebankWordTokenizer()
class RequirementsModel(object): ''' This class embeds the information residing in the XML of a requirements model passed as input parameter during construction ''' def __init__(self, modelID, inputXMLfilepath= "", modelType="", title="", objects=[]): ''' Constructor @param modelID: identifier of the model @param inputXMLfilepath: path to the input XML file containing the model if this parameter is left empty a new XML tree is created @param type: KAOS, TROPOS, or any other kind of model ''' self.textFilter = TextFilter() self.wordTokenizer = TreebankWordTokenizer() self.maxID = "100" #@todo: we have to set the current maximum to the actual maximum value #for the model self.modelInfo = ModelInfo(modelID) if not inputXMLfilepath == "": self.modelInfo.setLocation(inputXMLfilepath) self.tree = ET.parse(self.modelInfo.getLocation()) self.__loadModelInfo(self.modelInfo) self.modelGoals = self.__loadModelGoals() self.modelWords = self.__loadModelWords() self.modelStems = self.__loadModelStems() else: attributes = dict() attributes['type'] = modelType attributes['title'] = title attributes['object'] = objects root = Element("MODEL", attributes) self.tree = ElementTree(root) def __loadModelInfo(self, modelInfo): ''' This function load the name of the model from the "title" field of the MODEL tag, together with the type and the objects, and stores these information in the ModelInfo object ''' root = self.tree.getroot() modelInfo.setName(self.textFilter.lower_all(root.get("title"))) modelInfo.setType(self.textFilter.lower_all(root.get("type"))) objects = root.get("object").strip().split(OBJ_SEPARATOR) lowercaseObjects = [self.textFilter.lower_all(o) for o in objects] modelInfo.setObjects(lowercaseObjects) def __loadModelGoals(self): ''' The function loads the goal names included in the model and returns a list with all the goals of the model. 
The goals names are stored as lowercase goals ''' root = self.tree.getroot() goalNames = list() for child in root.iter('ENTITY'): if child.attrib['type'] == 'goal': goalNames.append(self.textFilter.lower_all(child.attrib['name'])) return goalNames def __loadModelWords(self): ''' The function loads the words included in the model and returns a dictionary with all the words of the model and their frequency ''' tokenizedWords = dict() if not self.modelGoals == None: for name in self.modelGoals: nameFiltered = self.textFilter.filter_all_except_stem(name) words = self.wordTokenizer.tokenize(nameFiltered) for word in words: if not tokenizedWords.has_key(word): tokenizedWords[word] = 1 else: tokenizedWords[word] = tokenizedWords[word] + 1 return tokenizedWords def __loadModelStems(self): ''' The function loads the stems included in the model and returns a dictionary with all the stems of the model and their frequency ''' tokenizedStems = dict() if not self.modelWords == None: for w in self.modelWords.keys(): stem = self.textFilter.filter_all(w) if not tokenizedStems.has_key(stem): tokenizedStems[stem] = self.modelWords[w] else: tokenizedStems[stem] = tokenizedStems[stem] + self.modelWords[w] return tokenizedStems def __getModelStems(self): return self.modelStems.keys() def __getModelWords(self): return self.modelWords.keys() def __getModelGoals(self): return self.modelGoals def __getModelStemsAndFreq(self): return self.modelStems def __getModelWordsAndFreq(self): return self.modelWords def getModelInfo(self): return self.modelInfo def getModelID(self): return self.modelInfo.getId() def getModelKeys(self, keyType): if keyType == STEM_STRING: return self.__getModelStems() if keyType == WORD_STRING: return self.__getModelWords() if keyType == GOAL_STRING: return self.__getModelGoals() def getModelKeysAndFrequencies(self, keyType): if keyType == STEM_STRING: return self.__getModelStemsAndFreq() if keyType == WORD_STRING: return self.__getModelWordsAndFreq() if keyType == GOAL_STRING: return dict(zip(self.__getModelGoals()), [1] * (len(self.__getModelGoals())) ) def changeTitle(self, newTitle): ''' This function shall change the title of the model, which means changing the modelInfo and the XML of the model ''' #self.modelInfo.setName(newTitle) root = self.tree.getroot() root.set("title", newTitle) self.__loadModelInfo(self.modelInfo) #the function updates the modelInfo structure def changeObjects(self, newObjectsList): ''' This function shall change the objects of the model, which means changing the modelInfo but also the XML of the model ''' root = self.tree.getroot() newObjects = ' ,'.join([o for o in newObjectsList]) root.set("object", newObjects) self.__loadModelInfo(self.modelInfo) def changeGoalName(self, goalID, newGoalName): ''' @param goalID: ID of the goal that shall have a new name @param newGoalName: string representing the new name of the goal ''' root = self.tree.getroot() for child in root.iter('ENTITY'): if child.attrib['type'] == 'goal' and child.attrib['id'] == goalID: child.attrib['name'] = newGoalName def searchGoalByName(self, goalName): ''' @param goalName: name of the goal to be searched return: goalID, which is the unique ID of the goal, if the goal exist -1, if the goal is not found ''' root = self.tree.getroot() for child in root.iter('ENTITY'): if child.attrib['type'] == 'goal' and child.attrib['name'] == goalName: return child.attrib['id'] return -1 def searchGoalsBySubstring(self, goalSubstring, caseSensitive = "NO"): ''' @param goalSubstring: a substring that shall be 
searched among the goal names. By default the search is not case sensitive return: a list with the couples [ID, goalName] of the goals that include the @param goalSubstring ''' root = self.tree.getroot() goalDict = dict() for child in root.iter('ENTITY'): if child.attrib['type'] == 'goal': if caseSensitive == "NO": if self.textFilter.lower_all(goalSubstring) in self.textFilter.lower_all(child.attrib['name']): goalDict[child.attrib['id']] = child.attrib['name'] else: if goalSubstring in child.attrib['name']: goalDict[child.attrib['id']] = child.attrib['name'] return goalDict def __assignUniqueIDs(self, treeRoot): ''' This function assigns unique IDs to all the objects of type ENTITY in @param tree ''' currentMaxId = self.maxID for child in treeRoot.iter('ENTITY'): currentMaxId = str( int(currentMaxId) + 1 ) child.attrib['id'] = currentMaxId self.maxID = currentMaxId def insertTree(self, parentID, childTree): ''' Given a @param childTree, which is a tree or a node, this is added as a child of parentID below the first refinement of the parent. The assumption here is that each parent can have ONLY ONE TYPE of refinement. The unique IDs to the child elements are dynamically assigned by the function. The childTree could be also a single node. ''' root = self.tree.getroot() for child in root.iter('ENTITY'): if child.attrib['id'] == parentID: refinement = child.findall("REFINEMENT") if refinement and len(refinement) == 1: #ONLY ONE TYPE of refinement is allowed for each element self.__assignUniqueIDs(childTree) refinement[0].append(childTree) return def saveModelAs(self, destinationFilePath): ''' @param destinationFilePath: path of the file where the model shall be saved. @todo: currently the model is saved to another location and the original location is lost. Therefore, the model currently keeps the same ID. We have to change this behaviour. ''' self.modelInfo.setLocation(destinationFilePath) self.saveModel() def saveModel(self): ''' Save the model in the same destination as the input folder and with the original name ''' try: self.tree.write(self.modelInfo.getLocation()) except IOError: print "IOError: Saving to a path that does not exist! Use saveModelAs() instead" except: print "An error occurred"
def __init__(self):
    '''
    Constructor
    '''
    self.stopwords = stopwords.words('english')
    self.wordTokenizer = TreebankWordTokenizer()
from nltk import ne_chunk, pos_tag
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer

'''
import nltk
nltk.download('words')
nltk.download('punkt')
nltk.download('maxent_treebank_pos_tagger')
nltk.download('maxent_ne_chunker')
'''

TreeBankTokenizer = TreebankWordTokenizer()
PunktTokenizer = PunktSentenceTokenizer()

text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
of the Eastern Conference. Founded in 1946, the team is currently owned by Boston Basketball Partners LLC. The Celtics play
their home games at the TD Garden, which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL. The
Celtics have dominated the league during the late 50's and through the mid 80's, with the help of many Hall of Famers which
include Bill Russell, Bob Cousy, John Havlicek, Larry Bird and legendary Celtics coach Red Auerbach, combined for a
795 - 397 record that helped the Celtics win 16 Championships.
'''

sentences = PunktTokenizer.tokenize(text)
tokens = [TreeBankTokenizer.tokenize(sentence) for sentence in sentences]
tagged = [pos_tag(token) for token in tokens]
chunked = [ne_chunk(taggedToken) for taggedToken in tagged]
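A small follow-on sketch (not part of the original script) showing one way to collect the recognized named entities from `chunked`; it assumes the script above has already run.

from nltk import Tree

entities = []
for tree in chunked:
    for subtree in tree:
        # ne_chunk wraps recognized entities in labeled subtrees (PERSON, GPE, ORGANIZATION, ...).
        if isinstance(subtree, Tree):
            entities.append((subtree.label(),
                             " ".join(word for word, tag in subtree.leaves())))

print(entities)  # e.g. pairs like ('GPE', 'Boston') or ('PERSON', 'Bill Russell'), depending on the NLTK model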
#!/usr/bin/env python

from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk, Tree
import re

_word_tokenizer = TreebankWordTokenizer()
_stemmer = PorterStemmer()
_lemmatizer = WordNetLemmatizer()

text = "At 12:35 a.m. ET (1735 GMT) the Dow Jones industrial average .DJI was up 211.89 points, or 1.31 percent, at 16,363.3."

# tokenize sentence
cleaned_sentence = re.sub(r'\W', ' ', text)
tokens = _word_tokenizer.tokenize(cleaned_sentence)
tokens_stemmed = [_stemmer.stem(word.lower()) for word in tokens]

print tokens_stemmed
def __init__(self):
    '''
    Constructor
    '''
    self.stopwords = open(stopwords_file, 'r').read().split()
    self.wordTokenizer = TreebankWordTokenizer()
def consistency(s): """ >>> m = consistency("Batch gradient descent algorithms " ... "... in Batch Gradient Descent ...") >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2] [('gradientdescent', ['Gradient Descent', 'gradient descent'])] >>> m = consistency("This sentence's first word appears uncapitalized in " ... "this sentence. Hadoop should be capitalized as " ... " Hadoop, not hadoop.") >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2] [('hadoop', ['Hadoop', 'hadoop'])] If the second word of a sentence is capitalized, it will be be considered if and only if the following word is uncapitalized: >>> m = consistency("The Operator may be replaced by another operator") >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2] [('operator', ['Operator', 'operator'])] >>> m = consistency("The Operator Descriptor describes an operator") >>> [(x, sorted(y)) for (x, y) in m.items() if len(y) >= 2] [] """ sent_tokenizer = PunktSentenceTokenizer() tokenizer = TreebankWordTokenizer() mappings = defaultdict(set) sentences = sent_tokenizer.tokenize(s) for sent in sentences: tokens = tokenizer.tokenize(sent) # The capitalization of individual words poses a problem: we would like # to detect cases where names are miscapitalized (e.g. hadoop instead # of Hadoop), but we want to avoid false-positives due to capitalized # words that start a sentence or are part of capitalized phrases. # Therefore, we only add mappings for capitalized unigrams if they do # not start a sentence and are not adjacent to other capitalized words. for i in range(1, len(tokens)): prev_token = tokens[i-1] token = tokens[i] if i+1 < len(tokens): next_token = tokens[i+1] else: next_token = "" adjacent_uppercase = (i > 1 and is_uppercase(prev_token)) or \ is_uppercase(next_token) if is_uppercase(token) and adjacent_uppercase: continue norm = canonicalize(token) source = token.strip(",. ") mappings[norm].add(source) # Map normalized ngrams for x in range(2, MAX_PHRASE_LENGTH+1): for ngram in ngrams(tokens, x): norm = canonicalize(ngram) source = " ".join(ngram).strip(",. ") if len(source.split()) == x: mappings[norm].add(source) # For normalized forms with mutiple values, filter out longer ngrams that # may be covered by shorter ones or that are trivial capitalization # differences for (key, values) in mappings.items(): if len(values) > 1: for (a, b) in bigrams(values): (x, y) = [" ".join(x) for x in strip_common_fixes(a.split(), b.split())] if (x, y) != (a, b): del mappings[key] break else: del mappings[key] return mappings
class TextFilter(object): ''' This class filter a text providing typical IR functions such as stop words removal, stemming and so forth ''' def __init__(self): ''' Constructor ''' self.stopwords = open(stopwords_file, 'r').read().split() self.wordTokenizer = TreebankWordTokenizer() def remove_stopwords(self, string_text): """ The function takes a string as input and returns a string without the stopwords """ tokens = self.wordTokenizer.tokenize(string_text) filteredtext = ' '.join([t for t in tokens if t.lower() not in self.stopwords]) return filteredtext def __remove_item_from_term(self, term, item): """ remove charachter @param item from the @param term """ return ''.join([c for c in term if c != item]) def remove_item(self, string_text, item): """ remove charachter @param item from the string """ tokens = self.wordTokenizer.tokenize(string_text) filteredtext = ' '.join([self.__remove_item_from_term(t, item) for t in tokens]) return filteredtext def stem_words(self, string_text): """ The function takes a string as input and returs a string with stemmed words """ tokens = self.wordTokenizer.tokenize(string_text) stemmer = PorterStemmer() stemmedtext = ' '.join([stemmer.stem(t) for t in tokens]) return stemmedtext def remove_punct(self, string_text): """ The function takes a string as input and returns the same string without punctuation """ nopunct_text = ''.join([c for c in string_text if re.match("[a-zA-Z\-\' \n\t]", c)]) return nopunct_text def lower_all(self, string_text): """ Reduce each term in @param string_text to lowecase """ tokens = self.wordTokenizer.tokenize(string_text) lowercase_string = ' '.join([t.lower() for t in tokens]) return lowercase_string def remove_single_char(self, string_text): """ remove single char items from @param string_text """ tokens = self.wordTokenizer.tokenize(string_text) no_single_char_string = ' '.join([t for t in tokens if len(t) > 1]) return no_single_char_string def filter_all(self, string_text): """ executes all the filter functions on @param string_text @param string_text: input text """ sentence_no_punct = self.remove_punct(string_text) sentence_no_single_char = self.remove_single_char(sentence_no_punct) sentence_no_stopwords = self.remove_stopwords(sentence_no_single_char) filtered_sentence = self.stem_words(sentence_no_stopwords) filtered_sentence = self.lower_all(filtered_sentence) #filtered_sentence = self.lower_all(sentence_no_single_char) return filtered_sentence
class QueryManager(object): ''' Given a specification query, this object returns a set of models together with possible transformations that can be applied to the model to address the satisfy the specification query ''' def __init__(self, modelIndexManager): ''' @param modelIndex: reference to the place where the models are indexed ''' self.textFilter = TextFilter() self.modelIndexManager = modelIndexManager self.wordTokenizer = TreebankWordTokenizer() self.tRecommender = TransformationRecommender() def __parseQuery(self, queryString): ''' This function returns the words included in queryString, after filtering all the stopwords, performing stemmming and applying all the filters provided by textFilter @param queryString: the specification query in the form of a string ''' filteredQueryString = self.textFilter.filter_all(queryString) return self.wordTokenizer.tokenize(filteredQueryString) def issueQuery(self, queryString): ''' This is the main function of this class. Given the specification query, the function parses the specification and returns a set of QueryResult objects, which include the link to the models @param queryString: the specification query in the form of a string @return: a list of QueryResult objects. ''' qr = list() stems = self.__parseQuery(queryString) for stem in stems: modelsInfos = self.modelIndexManager.searchModels(stem, STEM_STRING) #modelsTransformationsList = [(model, "object change") for model in models] #results[stem] = modelsTransformationsList if not modelsInfos == None: for modelInfo in modelsInfos: score = 0.1 transformation = self.tRecommender.getRecommendedTransformation(modelInfo, queryString) qr.append(QueryResult(modelInfo, [transformation], score)) qr.sort(key=lambda x: x.score) #the list is ordered by the score attribute and reversed qr.reverse() ''' @todo: for each model we shall understand which is the best transformation. To this end, an additional class is required. Currently, we always add the object change transformation together with each model found. ''' return qr
from collections import defaultdict
from glob import glob
from string import strip
import sys

from nltk.tokenize.treebank import TreebankWordTokenizer

VERBS = set(["geben", "helfen", "sagen", "machen", "arbeiten", "bringen"])

tokenizer = TreebankWordTokenizer()

if __name__ == "__main__":
    pattern = sys.argv[1]

    sentences = defaultdict(list)
    for ii in glob(pattern):
        for jj in map(strip, open(ii)):
            if any(jj.endswith("%s." % verb) for verb in VERBS):
                sentence = jj.split(".")[-2]
                words = tokenizer.tokenize(sentence)
                sentences[words[-1]].append(words)

    good_verbs = [x for x in sentences if len(sentences[x]) > 20]

    test_file = open("german/test.txt", 'w')

    print("COUNT:")
    for ii in good_verbs:
        print("%s\t%i" % (ii, len(sentences[ii])))
query = neo4j.CypherQuery(graph_db, cypherQuery)
for record in query.stream():
    if len(record) > 1:
        # a relationship exists
        break

print "The answer is..."
print answer_node
raw_input('Press enter to ask more')
question()

# Creating a Knowledge Graph sort of thing
graph_db = neo4j.GraphDatabaseService()
batch = neo4j.WriteBatch(graph_db)
TreeBankTokenizer = TreebankWordTokenizer()
PunktTokenizer = PunktSentenceTokenizer()

filename = raw_input("Enter file name\n")
f = open(filename, 'rU')
raw = f.read()

# normalize text
for p in string.punctuation:
    if p != ',':
        raw = raw.replace(p, '')
raw = raw.strip()

#IN = re.compile(r'.*\bin\b(?!\b.+ing)')
tokens = []
def question(): # question asking part qText = raw_input('Enter a question...\n') graph_db = neo4j.GraphDatabaseService() batch = neo4j.WriteBatch(graph_db) TreeBankTokenizer = TreebankWordTokenizer() PunktTokenizer = PunktSentenceTokenizer() qIdentifiers = { "What": ' ', "Who": 'PERSON', "Where": 'GPE', "When": 'TIME', "Why":'', "How":'' } entities = [] tokens = [] for sentence in PunktTokenizer.tokenize(qText): chunks = ne_chunk(pos_tag(TreeBankTokenizer.tokenize(sentence))) for chunk in chunks: if hasattr(chunk,'node'): tmp_tree = nltk.Tree(chunk.node, [(''.join(c[0] for c in chunk.leaves()))]) tokens.append(tmp_tree) else: tokens.append(chunk[0]) entities.extend([chunk for chunk in chunks if hasattr(chunk,'node')]) #print chunks #print tokens #entities dict entities_dict = {} for entity in entities: leaves = entity.leaves() if len(leaves) > 1 : entities_dict[entity.leaves()[0][0]+' '+entity.leaves()[1][0]] = entity.node else : entities_dict[entity.leaves()[0][0]] = entity.node #print entities_dict # Q&A answering algorithm # Find the type of question qId = '' for key in qIdentifiers.keys(): if key in str(qText): #remove key from text qText = qText.split(key)[1] print qText qId = qIdentifiers[key] # Find what kind of answer is required answerType = qId # find relation closese to the question text maximum = 0.0 queryRel = '' for rel in relations.keys(): # do string comparison #score = stringcomp(str(qText),str(relations[int(rel)])) score = SequenceMatcher(None,str(qText),str(relations[int(rel)])).ratio() if score > maximum : maximum = score queryRel = "`"+str(rel)+"`" #print queryRel # Find start node try: start_node = entities_dict.keys()[0] except Exception, err: print 'No entity found in the question' question()
def main(sysargs): sys.argv = sysargs arg_parser= argparse.ArgumentParser(description='Formats debates by removing HTML and filtering words.') arg_parser.add_argument('-i', '--infile', required=True, help='Debate file to format.') args = arg_parser.parse_args() # Initialize nltk elements. parser = SpeechHTMLParser() sent_splitter = PunktSentenceTokenizer() tokenizer = TreebankWordTokenizer() tagger_loc = '/het/users/jengi/stanford-postagger/' tagger = StanfordTagger(tagger_loc + 'models/wsj-0-18-bidirectional-distsim.tagger', \ tagger_loc + 'stanford-postagger.jar') stemmer = SnowballStemmer('english') # Read infile. speaker_pattern = re.compile('.*:') null_pattern = re.compile('\s*(\[[^\]]*\]|\([^\)]*\))') dash_pattern = re.compile('\S+(--)\s+') ellipse_pattern = re.compile('\s*\.\.\.\s*') noun_tags = ['NN', 'NNS', 'NNP', 'NNPS'] punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', \ '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', \ '\\', ']', '^', '_', '`', '{', '|', '}', '~'] block_lengths = [] with open(args.infile, 'r') as afile: file_contents = afile.read() parser.feed(file_contents) parser.close() num_blocks = 0 speeches = {} for (speaker, block) in parser.text: if num_blocks % 10 == 0: print >> sys.stderr, 'Processing block ' + str(num_blocks) + ' ...' orig_block = block # Remove applause, laughter, etc. block = repeated_search(block, null_pattern, 0) # Remove -- from the end of words. (Indicates stuttering / stopping.) block = repeated_search(block, dash_pattern, 1) # Do more complex tokenization. sents = sent_splitter.tokenize(block) sents = [ellipse_pattern.sub(' ... ', sent) for sent in sents] tokens = [tokenizer.tokenize(sent) for sent in sents] # Run POS tagger and keep only nouns. # Also lowercase and stem these nouns. tags = [tagger.tag(toks) for toks in tokens] tokens = [] tagged_text = []; for sent in tags: tokens.append([]) for (word, tag) in sent: tagged_text.append(word); tagged_text.append(tag); if tag in noun_tags: tokens[len(tokens) - 1].append(stemmer.stem(word.lower())) # Remove any "sentences" that are actually empty and # any tokens that are pure punctuation. for i in reversed(range(len(tokens))): for j in reversed(range(len(tokens[i]))): non_punct = ''.join([tok for tok in tokens[i][j] if tok not in punct]) if len(non_punct) == 0: del tokens[i][j] if len(tokens[i]) == 0: del tokens[i] # Make sure there is still at least one sentence left. num_sents = len(tokens) if num_sents == 0: continue # Add block to speeches dictionary. speaker = speaker[:speaker_pattern.match(speaker).end() - 1] if speaker not in speeches: speeches[speaker] = [] speeches[speaker].append(orig_block) speeches[speaker].append(' '.join(tagged_text)) speeches[speaker].append('\n'.join([' '.join(sent) for sent in tokens])) #print speeches[speaker][0] #print speeches[speaker][1] #print speeches[speaker][2] num_blocks += 1 num_tokens = 0 for toks in tokens: num_tokens += len(toks) block_lengths.append(num_tokens) # Save each speaker's text to a file. (infolder, basename) = os.path.split(os.path.abspath(args.infile)) out_prefix = infolder + '/' out_suffix = basename for speaker in speeches: # Create outfile prefixed by speaker's name. outfile = open(out_prefix + speaker + '-' + out_suffix, 'w') # Save text to outfile. 
blocks = speeches[speaker] for i in range(0, len(blocks), 3): print >> outfile, blocks[i] print >> outfile, blocks[i + 1] print >> outfile, blocks[i + 2] print >> outfile outfile.close() print '# of blocks: ' + str(num_blocks) print 'Mean # of tokens (per block): ' + str(scipy.mean(block_lengths)) print 'Median # of tokens: ' + str(scipy.median(block_lengths)) print 'Standard deviation in # of tokens: ' + str(scipy.std(block_lengths))
def main(): text = raw_input('Enter a question...\n') print text graph_db = neo4j.GraphDatabaseService() batch = neo4j.WriteBatch(graph_db) TreeBankTokenizer = TreebankWordTokenizer() PunktTokenizer = PunktSentenceTokenizer() qIdentifiers = { "What": ' ', "Who": 'PERSON', "Where": 'GPE', "When": 'TIME', "Why":'', "How":'' } entities = [] tokens = [] for sentence in PunktTokenizer.tokenize(text): chunks = ne_chunk(pos_tag(TreeBankTokenizer.tokenize(sentence))) for chunk in chunks: if hasattr(chunk,'node'): tmp_tree = nltk.Tree(chunk.node, [(''.join(c[0] for c in chunk.leaves()))]) tokens.append(tmp_tree) else: tokens.append(chunk[0]) entities.extend([chunk for chunk in chunks if hasattr(chunk,'node')]) #print chunks print tokens #entities dict entities_dict = {} for entity in entities: leaves = entity.leaves() if len(leaves) > 1 : entities_dict[entity.leaves()[0][0]+entity.leaves()[1][0]] = entity.node else : entities_dict[entity.leaves()[0][0]] = entity.node print entities_dict class doc():pass doc.headline=[''] doc.text = tokens # Q&A answering algorithm # Find the type of question qId = '' for key in qIdentifiers.keys(): if key in str(text): print key qId = qIdentifiers[key] # Find what kind of answer is required answerType = qId # Find start node start_node = entities_dict.keys()[0] start_node_type = entities_dict[start_node] # Run string similarity between relation text and question text # for the time being reading from the file # Build query cypherQuery = "START me=node:objects(name='" + start_node + "') MATCH me-[r]->obj RETURN r,obj.name LIMIT 10 " # Start Graph traversal query = neo4j.CypherQuery(graph_db, cypherQuery) for record in query.stream(): print 'printing records' print record[0] print record[1] print '\n'