def frequencyGenerator(self, s):
    pat = '[0-9|.| |\-|\[|\]|-|!|,|\\n|\\|/|:|"|(|)|=|<|>|@|\'|#]'
    tokenizer = RegexpTokenizer(pat, gaps=True)
    allWords = tokenizer.tokenize(s)
    #stemmer = WordnetStemmer()
    allWords = map(lambda x: x.lower().strip(), allWords)
    #allWordsStemmed = map(lambda x: stemmer.lemmatize(x), allWords)
    #del(allWords)
    allWords = self.cleanedWords(allWords)
    allWordsStemmed = allWords
    allWordsStemmed = filter(lambda x: len(x) > 2, allWordsStemmed)
    #allWordsStemmed = filter(lambda x: len(x) > 2, map(lambda x: stemmer.lemmatize(x), allWords))
    dic = {}
    for i in allWordsStemmed:
        if dic.has_key(i):
            dic[i] = dic[i] + 1
        else:
            dic[i] = 1
    st = ''
    dic = sorted(dic.items(), key=lambda (k, v): (v, k), reverse=True)
    for k in dic:
        try:
            st += str(k[0]) + ',' + str(k[1]) + ','
        except:
            pass
    print st
def parse(self, fname):
    """
    Parse the text of a file.

    :param fname: file name
    :return: (<file_name>, keyword density ("nausea"), fraud flag)
    """
    density, fraud = 0, 0
    with codecs.open(fname, "r", encoding="utf-8") as f:
        text = f.read()
    tknz = RegexpTokenizer(pattern="[А-Яа-яA-zё]+")
    txt_list = tknz.tokenize(text)
    if txt_list:
        for i, word in enumerate(txt_list):
            new_word = self.check_word(word)
            if new_word:
                txt_list[i] = new_word
                fraud += 1
        txt_list = [word.lower() for word in txt_list if not (word.lower() in self.sw)]
        stemmer_ru = RussianStemmer()
        txt_list = [stemmer_ru.stem(token.lower()) for token in txt_list if len(token) > 1]
        dict_w = Counter(txt_list)
        top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
        top5_count = sum([dict_w[word] for word in top5])
        density = top5_count / len(txt_list)
    # The (fraud > 2) threshold was chosen based on testing against the available sample:
    # listings often contain words like "ШxДхВ" (WxDxH) that cannot be recognized unambiguously.
    # The criterion is open for discussion and may need to be adjusted.
    return fname, density, fraud > 2
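# A minimal, self-contained sketch of the "top-5 density" metric computed in parse() above,
# using only the standard library. The token list here is made up for illustration; note that
# true division (Python 3 semantics) is assumed, otherwise density would be truncated to 0.
from collections import Counter
import heapq

tokens = ["flat", "sale", "flat", "urgent", "flat", "centre", "sale", "repair", "view", "metro", "park"]
dict_w = Counter(tokens)
top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
density = sum(dict_w[w] for w in top5) / len(tokens)
print(density)  # 8 / 11, roughly 0.73 for this toy list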
def tokenize(text):
    # This regex is edited to accept character words only.
    str_ = "[A-Za-z]+"
    regex_tokens = RegexpTokenizer(str_)
    tokens = regex_tokens.tokenize(text.lower())
    stems = stem_tokens(tokens, WNL)
    return stems
def tokenize_sentence(text, preprocess=True):
    '''
    Tokenize the given sentence, applying preprocessing if requested
    (conversion to lower case and digit substitution).
    '''
    if preprocess:
        text = re.sub(r'\d', '9', text.lower())

    tokenizer_regexp = ur'''(?ux)
    ([^\W\d_]\.)+|                       # one letter abbreviations, e.g. E.U.A.
    \d{1,3}(\.\d{3})*(,\d+)|             # numbers in format 999.999.999,99999
    \d{1,3}(,\d{3})*(\.\d+)|             # numbers in format 999,999,999.99999
    \d+:\d+|                             # time and proportions
    \d+([-\\/]\d+)*|                     # dates. 12/03/2012 12-03-2012
    [DSds][Rr][Aa]?\.|                   # common abbreviations such as dr., sr., sra., dra.
    [Mm]\.?[Ss][Cc]\.?|                  # M.Sc. with or without capitalization and dots
    [Pp][Hh]\.?[Dd]\.?|                  # Same for Ph.D.
    [^\W\d_]{1,2}\$|                     # currency
    (?:(?<=\s)|^)[\#@]\w*[A-Za-z_]+\w*|  # Hashtags and twitter user names
    -[^\W\d_]+|                          # clitic pronouns with leading hyphen
    \w+([-']\w+)*|                       # words with hyphens or apostrophes, e.g. não-verbal, McDonald's
    -+|                                  # any sequence of dashes
    \.{3,}|                              # ellipsis or sequences of dots
    \S                                   # any non-space character
    '''
    tokenizer = RegexpTokenizer(tokenizer_regexp)

    return tokenizer.tokenize(text)
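# Hedged usage sketch for tokenize_sentence(). The sample sentence and the expected output are
# illustrative only; actual tokens depend on the preprocess flag and the regex above.
print(tokenize_sentence(u"Dra. Silva pagou 1.234,56 em 12/03/2012 no McDonald's...", preprocess=False))
# expected, roughly: [u'Dra.', u'Silva', u'pagou', u'1.234,56', u'em', u'12/03/2012',
#                     u'no', u"McDonald's", u'...']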
def tokenize(self, text): """ tokenize text into a list of Token objects :param text: text to be tokenized (might contains several sentences) :type text: str :return: List of Token objects :rtype: list(Token) """ tokens = [] if self.tokenizer_type == "SpaceTokenizer": operator = RegexpTokenizer('\w+|\$[\d\.]+|\S+') for counter, span in enumerate(operator.span_tokenize(text)): new_token = Token(counter, text[span[0]:span[1]], span[0], span[1]) tokens.append(new_token) elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer": operator = WhitespaceTokenizer() for counter, span in enumerate(operator.span_tokenize(text)): new_token = Token(counter, text[span[0]:span[1]], span[0], span[1]) tokens.append(new_token) elif self.tokenizer_type == "PTBTokenizer": ptb_tokens = word_tokenize(text) counter = 0 for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens): new_token = Token(counter, token, span[0], span[1]) counter += 1 tokens.append(new_token) return tokens
def get_emails_sent_by_person_list(emails_df):
    tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')
    emails_df['subject_wc'] = emails_df['subject'].map(
        lambda x: len(tokenizer.tokenize(x)))
    emails_df['content_wc'] = emails_df['content'].map(
        lambda x: len(tokenizer.tokenize(x)))

    grouped_by_people = emails_df.groupby('from').agg({
        'content': 'count',
        'subject_wc': 'mean',
        'content_wc': 'mean',
    })
    grouped_by_people.rename(columns={
        'content': 'N emails',
        'subject_wc': 'Subject word count',
        'content_wc': 'Content word count'
    }, inplace=True)
    # sort_values is not in-place: keep the sorted result before writing it out
    grouped_by_people = grouped_by_people.sort_values(by=['N emails'], ascending=False)

    file_path = os.path.join(dir_path, 'results/emails_by_person.csv')
    grouped_by_people.to_csv(file_path)
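# Quick check of the word-count tokenizer used above: r'(?u)\b\w\w+\b' keeps only tokens of two
# or more word characters (the same default token pattern scikit-learn's CountVectorizer uses),
# so punctuation and one-letter words do not inflate the counts. The sample subject line is made up.
from nltk.tokenize import RegexpTokenizer

wc_tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')
print(wc_tokenizer.tokenize("Re: Q3 report - a draft"))       # ['Re', 'Q3', 'report', 'draft']
print(len(wc_tokenizer.tokenize("Re: Q3 report - a draft")))  # 4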
def token_words(lyric):
    """
    in: lyric (element of row['text'])

    Take the whole lyric and convert it into a list of words for analysis.
    Apply a few cleaning steps to remove punctuation, stopwords and errors
    (minor focus on this).

    return: list of words in the lyric
    """
    lyric = lyric.lower()

    # The tokenizer below tokenizes the lyric ('text') into words without punctuation.
    # It splits apostrophe words into two separate words, but that is acceptable: most of the
    # time words with apostrophes are non-main verbs (would, should, etc.), which are usually
    # insignificant in context and will be removed anyway.
    # e.g. would've -> "would" "ve", and the stopword step removes "ve".
    # TweetTokenizer was producing very irregular words in the lyric, such as "(8" and numbers,
    # and was dropped.

    # apply tokenizer
    tokenizer1 = RegexpTokenizer("[a-z]+")
    words = tokenizer1.tokenize(lyric)

    # convert the list of stopwords to a set of stopwords for faster access
    en_stopwords = set(stopwords.words('english'))

    # remove stopwords from words, and add a few extra words for a cleaner result
    en_stopwords.add('chorus')

    # single letters aren't really words :)
    for c in ascii_lowercase:
        en_stopwords.add(c)

    words_lyric = [w for w in words if w not in en_stopwords]

    # post-process words_lyric
    words_lyric = postProcess(words_lyric)

    return words_lyric
def no_stop_tokens(self, text):
    tokens = []
    tokenizer = RegexpTokenizer('(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
    tokens += tokenizer.tokenize(text)
    #stemmer = nltk.stem.snowball.EnglishStemmer()
    #tokens = map(lambda x: stemmer.stem(x), tokens)
    return tokens
def create_sents(toks):
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    s = RegexpTokenizer(wordre).tokenize(toks)
    for sentence in s:
        # NOTE: the result of this second tokenization pass is discarded;
        # the function returns its input unchanged.
        RegexpTokenizer(wordre).tokenize(sentence)
    return toks
def __init__(self):
    self.tokenize = RegexpTokenizer(r'\b([A-Za-z]+)\b')  # remove the punctuation
    if ver == 2:
        self.stemmer = SnowballStemmer("english")  # using stemmed version of words
    elif ver == 1:
        self.stemmer = LancasterStemmer()
    else:
        self.stemmer = PorterStemmer()
def tokenize(self, text):
    tokens = []
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer('(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
    tokens += tokenizer.tokenize(text)
    tokens = filter(lambda x: x.lower() not in STOP_WORDS and len(x) > 1, tokens)
    tokens = map(lambda token: wordnet_lemmatizer.lemmatize(token), tokens)
    return tokens
def readreviews(filename, header=True, fieldsep="\t"):
    '''
    Reads the training file for LDA.

    Parameters:
    - filename: input filename
    - header: True if the header line is present; False otherwise.
    - fieldsep: separator

    Return:
    - a list of entries (where each entry is the list of tokens for one review).
    '''
    review_data = list()
    prevsid = ""
    #filehandle = open(filename, "r")
    stopwords = loadstopwords("english.stop.txt")
    pdsnotthere = 0
    title_notthere = 0
    tokenizer = RegexpTokenizer('[a-z]\w+')
    with open(filename, 'rU') as filename:
        filehandle = csv.reader(filename, delimiter='\t', quotechar='"')
        #print filehandle
        for line in filehandle:
            #lineparts = line.split(fieldsep)
            fields = line
            if header:
                header = False
                continue
            if len(fields) != 12:
                continue
            # print len(fields)
            HotelID = fields[0]
            hotelname = fields[1]
            HotelURL = fields[2]
            Address = fields[3]
            ImgURL = fields[4]
            Author = fields[5]
            Price = fields[6]
            location = fields[7]
            Title = fields[8]
            ReviewID = fields[9]
            Content = fields[10]
            Rating_ovarall = fields[11]
            review_content = Title + ' ' + Content
            review_discription = review_content
            words = [token for token in tokenizer.tokenize(review_discription)
                     if token not in stopwords]
            review_data.append(words)
    return review_data
def tokenize(self, text):
    tokens = []
    tokenizer = RegexpTokenizer('(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
    #tokens += tokenizer.tokenize(self.title.lower())
    tokens += tokenizer.tokenize(text.lower())
    tokens = filter(lambda x: x not in STOP_WORDS and len(x) > 1, tokens)
    #stemmer = nltk.stem.snowball.EnglishStemmer()
    #tokens = map(lambda x: stemmer.stem(x), tokens)
    return tokens
def emailExtractor(sentence, word):
    # https://stackoverflow.com/questions/39777806/how-to-update-nltk-package-so-that-it-does-not-break-email-into-3-different-toke
    pattern = r'\S+@[^\s.]+\.[a-zA-Z]+|\w+|[^\w\s]'
    tokeniser = RegexpTokenizer(pattern)
    for w in tokeniser.tokenize(sentence):
        if re.search('^(\w|\.|\_|\-)+[@](\w|\_|\-|\.)+[.]\w{2,3}$', w):
            context["email"] = w
            return True
    return False
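# Small illustration of why the pattern above is used: with the e-mail alternative listed first,
# an address survives as a single token instead of being split at '@' and '.'. The sample
# sentence is made up.
from nltk.tokenize import RegexpTokenizer

tokeniser = RegexpTokenizer(r'\S+@[^\s.]+\.[a-zA-Z]+|\w+|[^\w\s]')
print(tokeniser.tokenize("Reach me at john.doe@example.com, thanks!"))
# ['Reach', 'me', 'at', 'john.doe@example.com', ',', 'thanks', '!']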
def __init__(self, args, opts): super(Method, self).__init__(args, opts) self.es_int = ESInterface(host=self.opts.server, port=self.opts.port, index_name=self.opts.index_name) self.analyzer = self.es_int.get_index_analyzer() self.regex_citation = re.compile( r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|" r"(\[(\d+([,–]\s?)?)+\])|" r"\[[\d,-]+\]").sub self.all_digits = re.compile(r"^\d+$").search if self.opts.remove_stopwords: with file(self.opts.stopwords_path) as f: self.stopwords = frozenset([l.strip().lower() for l in f]) else: self.stopwords = frozenset([]) self.db = MySQLdb.connect(host=constants.mysql_server, port=constants.mysql_port, user=constants.mysql_user, passwd=constants.mysql_pass, db=constants.mysql_db) self.cur = self.db.cursor() self.ttys = ['SY'] ttygroups = { "syns": ('AUN', 'EQ', 'SYN', 'MTH'), "chemicals": ('CCN', 'CSN'), "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'), "diseases": ('DI', ), "findings": ('FI', ), "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ), "preferred": ('PTN', 'PT') } self.doc_mod = documents_model.DocumentsModel(opts.anns_dir) # self.ann_client = AnnotationsClient() self.reg_apa = re.compile( # [Chen et al.2000] r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|" r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|" r"\w+\set al\. \(\d{2,4}\)") # [Chen et al. 200] self.reg_apa_rare = re.compile( r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+" ) self.reg_apa2 = re.compile( r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)" ) self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]") self.reg_paranthesis = re.compile( r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)") self.nlp_extractor = Extract_NLP_Tags() self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True) self.lmtzr = WordNetLemmatizer() self.stemmer = stem.porter.PorterStemmer()
def __init__(self):
    nltk.download("punkt")
    nltk.download("stopwords")
    nltk.download("wordnet")  # lemmatization
    self._tokenizer = RegexpTokenizer(r"\w+")
    self._stop_words = set(stopwords.words("english"))
    # self._stemmer = nltk.stem.SnowballStemmer("english")
    self._lemmatizer = nltk.wordnet.WordNetLemmatizer()
    self._vocabulary = set()
def tokenize_large_text_file(file_name, file_location):
    tokens_array = set()
    file_path = os.path.join(file_location, file_name)
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    with open(file_path, 'r') as f:
        for line in f:
            tokens_array.update(tokenizer.tokenize(line))
    tokens_dict = {str(file_name) + " - {} tokens".format(file_name): list(tokens_array)}
    with open('tokens taken from - {} - .json'.format(str(file_name)), 'w') as f:
        json.dump(tokens_dict, f)
def calcLocation(self, token):
    size = self.instTab.searchFormat(token.operator)
    if size > 0:
        return size

    if token.operator == "RESW" or token.operator == "WORD":
        size = 3
    elif token.operator == "RESB":
        size = int(token.operand[0])
    elif token.operator == "BYTE":
        size = 1
    elif token.operator == "LTORG":
        size = self.literalTab.literalCount
        self.literalTab.setLiteralCount(0)
        count = 0
        for litCheck in self.literCheck:
            if litCheck[1:2] == 'C':
                size = 3
            else:
                size = 1
            self.literalTab.modifyLiteral(litCheck, TokenTable.locCount + (count * size))
            count += 1
    elif token.operator == "END":
        size = self.literalTab.literalCount
        self.literalTab.setLiteralCount(0)
        count = 0
        for litCheck in self.literCheck:
            self.literalTab.modifyLiteral(litCheck, token.location)
            count += 1
    elif token.operator == "EQU":
        if token.operand[0] == "*":
            size = 0
        else:
            tokenizer = RegexpTokenizer("-", gaps=True)
            tokens = tokenizer.tokenize(token.operand[0])
            value1 = self.symTab.search(tokens[0])
            value2 = self.symTab.search(tokens[1])
            size = value1 - value2
            self.symTab.modifySymbol(token.label, size)
            size = 0
    else:
        size = -1
    return size
def getDoc_set():
    tokenizer = RegexpTokenizer(r'\w+')
    for doc in getCorpus.corpus_doc:
        # print type(doc)
        raw = doc.lower()
        tokens = tokenizer.tokenize(raw)
        en_stop = get_stop_words("en")
        stopped_tokens = [i for i in tokens if i not in en_stop]
        p_stemmer = PorterStemmer()
        texts = [p_stemmer.stem(i).encode('utf-8') for i in stopped_tokens]
        getCorpus.doc_set.append(texts)
def get_search_terms(request: HttpRequest):
    # Get any search terms
    tr = RegexpTokenizer('[^"\s]\S*|".+?"', gaps=False)
    search_text = str(request.GET.get('search_text', ''))
    # Respect quoted strings
    search_terms = tr.tokenize(search_text)
    if len(search_terms) == 0:
        solr_search_terms = "*"
    else:
        solr_search_terms = ' '.join(search_terms)
    return solr_search_terms
def get_search_terms(search_text: str):
    # Get any search terms
    tr = RegexpTokenizer('[^"\s]\S*|".+?"', gaps=False)
    # Respect quoted strings
    search_terms = tr.tokenize(search_text)
    if len(search_terms) == 0:
        solr_search_terms = "*"
    else:
        solr_search_terms = ' '.join(search_terms)
    return solr_search_terms
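# How the tokenizer above treats quoted phrases: unquoted terms are split on whitespace, while a
# double-quoted phrase is kept as a single Solr search term. The query string is made up.
from nltk.tokenize import RegexpTokenizer

tr = RegexpTokenizer('[^"\s]\S*|".+?"', gaps=False)
print(tr.tokenize('open data "annual report" 2020'))
# ['open', 'data', '"annual report"', '2020']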
def __init__(self, corpus, tokenize_str, delimiter, n, max_length):
    self.corpus = corpus
    self.tokenizer = RegexpTokenizer(tokenize_str)
    self.delimiter = delimiter
    self.n = n
    self.max_length = max_length
    # use set methods to set these variables
    self.tokenized_corpus = []
    self.startList = []
    self.ngramDict = defaultdict(list)
    self.unigramDict = defaultdict(list)
    self.set_tokenized_corpus()
    self.set_ngrams()
def sentence_length(corpus):
    too_long_sentences = []
    total_sentences = 0
    tokenizer = RegexpTokenizer("\s+", gaps=True)
    articles = preprocessing(corpus)
    for article in articles:
        sentences = sent_tokenize(article)
        total_sentences += len(sentences)
        for sentence in sentences:
            words = tokenizer.tokenize(sentence)
            if len(words) > 25:
                too_long_sentences.append((sentence, len(words)))
    return (1 - len(too_long_sentences) / total_sentences) * 100
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    if self.opts.remove_stopwords:
        with file(self.opts.stopwords_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
    self.ann_client = AnnotationsClient()

    self.reg_apa = re.compile(  # [Chen et al.2000]
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
        r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
    self.reg_apa_rare = re.compile(
        r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
    self.reg_apa2 = re.compile(
        r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
    self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
    self.reg_paranthesis = re.compile(
        r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
    self.nlp_extractor = Extract_NLP_Tags()
    self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
    self.lmtzr = WordNetLemmatizer()
def getNew_object(docs, newDoc_object):
    tokenizer = RegexpTokenizer(r'\w+')
    for doc in docs:
        # print type(doc[1])
        raw = doc[1].lower()
        tokens = tokenizer.tokenize(raw)
        en_stop = get_stop_words("en")
        stopped_tokens = [i for i in tokens if i not in en_stop]
        p_stemmer = PorterStemmer()
        texts = {}
        # texts = [p_stemmer.stem(i).encode('utf-8') for i in stopped_tokens]
        # print texts
        for i in stopped_tokens:
            stemmed = p_stemmer.stem(i).encode('utf-8')
            texts[stemmed] = texts.get(stemmed, 0) + 1
        newDoc_object.append(texts)
def __init__(self):
    tokenizer_regexp = r'''(?ux)
    # the order of the patterns is important!!
    # more structured patterns come first
    [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+|  # emails
    (?:https?://)?\w{2,}(?:\.\w{2,})+(?:/\w+)*|          # URLs
    (?:[\#@]\w+)|                                        # Hashtags and twitter user names
    (?:[^\W\d_]\.)+|                                     # one letter abbreviations, e.g. E.U.A.
    (?:[DSds][Rr][Aa]?)\.|                               # common abbreviations such as dr., sr., sra., dra.
    (?:\B-)?\d+(?:[:.,]\d+)*(?:-?\w)*|                   # numbers in format 999.999.999,999, possibly
                                                         # followed by hyphen and alphanumerics;
                                                         # \B- avoids picking up F-14 as a negative number
    \.{3,}|                                              # ellipsis or sequences of dots
    \w+|                                                 # alphanumerics
    -+|                                                  # any sequence of dashes
    \S                                                   # any non-space character
    '''
    RegexpTokenizer.__init__(self, tokenizer_regexp)
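# Hedged usage sketch for the tokenizer defined above. The enclosing class name is not shown in
# this excerpt, so "TweetTokenizer" here is a stand-in; substitute the real subclass name.
tok = TweetTokenizer()
print(tok.tokenize("Contact @user via email@example.com or http://example.com/page #news"))
# expected, roughly: ['Contact', '@user', 'via', 'email@example.com', 'or',
#                     'http://example.com/page', '#news']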
class tk(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'\b([a-zA-Z]+)\b')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(s) for s in self.tok.tokenize(doc)]


# define the word list to be ignored
stop_words = text.ENGLISH_STOP_WORDS
def prep_string(s):
    s = re.sub("\n", " ", s)
    s = re.sub("\>", " ", s)
    #toks = Token(TEXT=s, LOC=CharSpanLocation(0, len(s), 's'))
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    toks = RegexpTokenizer(wordre).tokenize(s)
    word_list = []
    for tok in toks:
        word_list.append(tok)
    return word_list
def getTokenCount(description):
    tokens = RegexpTokenizer(r'\w+').tokenize(description)
    tokens = [w.lower() for w in tokens]
    stopwords = yaml.load(open("backend/nltk/stopwords.yaml", "r"))
    tokens = [w for w in tokens if w not in stopwords]
    tokens = [w for w in tokens if len(w) > 2]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(w) for w in tokens]
    tokenCount = collections.Counter(tokens)
    return tokenCount
def parsing(self, line):
    line = line[:-1]  # strip the trailing '\n'
    # split the incoming line on tabs and store the pieces in the tokens list
    tokenizer = RegexpTokenizer("\t", gaps=True)
    tokens = tokenizer.tokenize(line)
    count = 1
    for token in tokens:
        if count == 1:
            self.instruction = token
        elif count == 2:
            self.format = int(token)
        elif count == 3:
            self.opcode = int(token, 16)
        elif count == 4:
            self.numberOfOperand = int(token)
        else:
            print("[InstTable.py] parsing() error")
        count += 1
class tokenizer(object):
    def __init__(self):
        self.tokenize = RegexpTokenizer(r'\b([A-Za-z]+)\b')  # remove the punctuation
        if ver == 2:
            self.stemmer = SnowballStemmer("english")  # using stemmed version of words
        elif ver == 1:
            self.stemmer = LancasterStemmer()
        else:
            self.stemmer = PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tokenize.tokenize(doc)]
class Extracteur_Mots(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.regexp = RegexpTokenizer("[a-z][a-z']{2,}")

    def fit(self, comments, y=None):
        return self

    def transform(self, comments, y=None):
        mots = []
        for c in comments:
            mots.append(self.regexp.tokenize(c.lower()))
        return mots
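# Hedged usage sketch: as a scikit-learn transformer, Extracteur_Mots can sit at the start of a
# Pipeline or be used standalone. The sample comment is made up; note that the pattern
# "[a-z][a-z']{2,}" only keeps lowercase ASCII words of three or more characters, so accented
# words like "très" are dropped.
extracteur = Extracteur_Mots()
print(extracteur.fit(None).transform(["Un très bon film, vraiment excellent !"]))
# expected, roughly: [['bon', 'film', 'vraiment', 'excellent']]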
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    if self.opts.stopwords_path:
        stop_path = self.opts.stopwords_path
    else:
        stop_path = STOPWORDS_PATH
    if self.opts.remove_stopwords:
        with file(self.opts.stopwords_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
def parsing(self, line):
    line = line[:-1]  # strip the trailing '\n'
    # split the incoming line on tabs and store the pieces in the tokens list
    tokenizer = RegexpTokenizer("\t", gaps=True)
    tokens = tokenizer.tokenize(line)
    count = 0
    for token in tokens:
        count += 1
        if count == 1:
            self.label = token
        elif count == 2:
            self.operator = token
        elif count == 3:
            opnd = token
            tokenizer = RegexpTokenizer(",", gaps=True)
            opnds = tokenizer.tokenize(opnd)
            i = 0
            for op in opnds:
                self.operand.append(op)
                i += 1
        elif count == 4:
            self.comment = token
        else:
            print("[TokenTable.py] parsing() error")
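# Quick illustration of the gaps=True tokenizers used above: with gaps=True the pattern describes
# the separator rather than the token, so a tab-delimited assembler line splits into its
# label / operator / operand / comment fields, and the operand field splits on commas.
# The sample line is made up.
from nltk.tokenize import RegexpTokenizer

line = "FIRST\tSTL\tRETADR,BUFFER\tsave return address\n"
fields = RegexpTokenizer("\t", gaps=True).tokenize(line[:-1])
print(fields)  # ['FIRST', 'STL', 'RETADR,BUFFER', 'save return address']
print(RegexpTokenizer(",", gaps=True).tokenize(fields[2]))  # ['RETADR', 'BUFFER']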
def summarize(docid, score_dict, pos_dict):
    global cnt
    tokenizer = RegexpTokenizer('\w+')  # create a tokenizer that matches words
    snippet = ""
    with open(path + "//" + docid, "r") as fi:  # open the extracted text file in read mode
        file_text = fi.read()
    tokens = tokenizer.tokenize(file_text)  # tokenize the text file using the tokenizer
    if score_dict[docid] not in (0, -1):  # for normal phrase/word queries
        cnt += 1
        for pos in pos_dict[docid]:  # build snippets around the positions from the position dictionary
            # take the preceding and following 8 words around the identified position in the text
            pos1 = abs(pos - 8)
            pos2 = pos + 8
            if pos1 < 0:
                pos1 = 0
            if pos2 > len(tokens):
                pos2 = len(tokens)
            snippet = ' '.join(tokens[pos1:pos2])
            print docid, "\t", snippet  # display docid and snippet
    elif score_dict[docid] == -1:  # display document ids that do not contain the negated word/phrase
        cnt += 1
        print docid
def tokenizeText(self, text):
    ret = RegexpTokenizer(u"(\d+\u00B0(\.)?)|(nr\.)|(\d+/\d+/eg)|(\d+\:\d+\w*)|(\d+\.\d+\w*)+|[\w\d]+|(\s\w\.)|(\.)|\,|\t|[^ \t\n\r\f\v\w\d]")
    tokens = ret.tokenize(text)
    ntokens = []
    sentence = []
    i = -1
    for t in tokens[:-1]:
        i += 1
        if type(t) is StringType:
            t = t.decode('UTF-8')
        if (t.istitle() and tokens[i - 1] == '.') or (regex.search(r'^\d+', t) and tokens[i + 1].istitle()):
            ntokens.append(sentence)
            sentence = [t.lower().strip()]
        else:
            sentence.append(t.lower().strip())
    sentence.append(tokens[-1].lower().strip())
    ntokens.append(sentence)
    return ntokens
def __init__(self, args, opts): super(Method, self).__init__(args, opts) self.es_int = ESInterface(host=self.opts.server, port=self.opts.port, index_name=self.opts.index_name) self.analyzer = self.es_int.get_index_analyzer() self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|" r"(\[(\d+([,–]\s?)?)+\])|" r"\[[\d,-]+\]").sub self.all_digits = re.compile(r"^\d+$").search if self.opts.remove_stopwords: with file(self.opts.stopwords_path) as f: self.stopwords = frozenset([l.strip().lower() for l in f]) else: self.stopwords = frozenset([]) self.db = MySQLdb.connect(host=constants.mysql_server, port=constants.mysql_port, user=constants.mysql_user, passwd=constants.mysql_pass, db=constants.mysql_db) self.cur = self.db.cursor() self.ttys = ['SY'] ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'), "chemicals": ('CCN', 'CSN'), "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'), "diseases": ('DI', ), "findings": ('FI', ), "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ), "preferred": ('PTN', 'PT')} self.doc_mod = documents_model.DocumentsModel(opts.anns_dir) # self.ann_client = AnnotationsClient() self.reg_apa = re.compile( # [Chen et al.2000] r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|" r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|" r"\w+\set al\. \(\d{2,4}\)") # [Chen et al. 200] self.reg_apa_rare = re.compile( r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+") self.reg_apa2 = re.compile( r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)") self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]") self.reg_paranthesis = re.compile( r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)") self.nlp_extractor = Extract_NLP_Tags() self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True) self.lmtzr = WordNetLemmatizer() self.stemmer = stem.porter.PorterStemmer()
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    if self.opts.stopwords_path:
        stop_path = self.opts.stopwords_path
    else:
        stop_path = STOPWORDS_PATH
    if self.opts.remove_stopwords:
        with file(self.opts.stopwords_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
def tokenize(text, clean=True):
    """
    Returns a list of lists of the tokens in text, separated by sentences.
    Each line break in the text starts a new list.

    :param clean: If True, performs some cleaning action on the text, such as replacing
        numbers for the __NUMBER__ keyword (by calling :func:`clean_text`)
    """
    ret = []

    if type(text) != unicode:
        text = unicode(text, 'utf-8')

    if clean:
        text = clean_text(text, correct=True)
    else:
        # replace numbers for __NUMBER__ and store them to replace them back
        numbers = re.findall(ur'\d+(?: \d+)*(?:[\.,]\d+)*[²³]*', text)
        numbers.extend(re.findall(ur'[²³]+', text))
        text = re.sub(ur'\d+( \d+)*([\.,]\d+)*[²³]*', '__NUMBER__', text)
        text = re.sub(ur'[²³]+', '__NUMBER__', text)

    # clitic pronouns
    regexp = r'''(?ux)
    (?<=\w)                     # a letter before
    -(me|
    te|
    o|a|no|na|lo|la|se|
    lhe|lho|lha|lhos|lhas|
    nos|
    vos|
    os|as|nos|nas|los|las|      # unless if followed by more chars
    lhes)(?![-\w])              # or digits or hyphens
    '''
    text = re.sub(regexp, r'- \1', text)

    regexp = ur'''(?ux)
    # the order of the patterns is important!!
    ([^\W\d_]\.)+|              # one letter abbreviations, e.g. E.U.A.
    __NUMBER__:__NUMBER__|      # time and proportions
    [DSds][Rr][Aa]?\.|          # common abbreviations such as dr., sr., sra., dra.
    [^\W\d_]{1,2}\$|            # currency
    \w+([-']\w+)*-?|            # words with hyphens or apostrophes, e.g. não-verbal, McDonald's
                                # or a verb with clitic pronoun removed (trailing hyphen is kept)
    -+|                         # any sequence of dashes
    \.{3,}|                     # ellipsis or sequences of dots
    __LINK__|                   # links found on wikipedia
    \S                          # any non-space character
    '''

    # loads trained model for tokenizing Portuguese sentences (provided by NLTK)
    sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

    # the sentence tokenizer doesn't consider line breaks as sentence delimiters, so
    # we split them manually.
    sentences = []
    lines = text.split('\n')
    for line in lines:
        sentences.extend(sent_tokenizer.tokenize(line, realign_boundaries=True))

    t = RegexpTokenizer(regexp)

    for p in sentences:
        if p.strip() == '':
            continue

        # Wikipedia cleaning
        if clean:
            # discard sentences with troublesome templates or links
            if any((x in p for x in ['__TEMPLATE__', '{{', '}}', '[[', ']]'])):
                continue

        new_sent = t.tokenize(p)

        if clean:
            # discard sentences that are a couple of words (it happens sometimes
            # when extracting data from lists).
            if len(new_sent) <= 2:
                continue
        elif len(numbers) > 0:
            # put back numbers that were previously replaced
            for i in xrange(len(new_sent)):
                token = new_sent[i]
                while '__NUMBER__' in token:
                    token = token.replace('__NUMBER__', numbers.pop(0), 1)
                new_sent[i] = token

        ret.append(new_sent)

    return ret
def removeApostrophe(string):
    return string.replace(u"\u2019", '')


def stringFromHTMLParagraph(paraWithTags):
    paraString = ''
    for taggedString in paraWithTags.strings:
        paraString += removeApostrophe(taggedString.string)
    return paraString


def titleFromArticleSoup(soup):
    titleDiv = soup.find(class_='story-heading')
    if not titleDiv:
        titleDiv = soup.find(class_='entry-title')
    return unicode(removeApostrophe(titleDiv.string))


# Set up the tokenizer and the tagger
tokenizer = RegexpTokenizer(r'\w+')
tagger = UnigramTagger(treebank.tagged_sents())

# Open up a redis connection
redisInterface = RedisInterface()

# Print status
print 'Reader ONLINE'

# Run the wait-execute loop
while True:
    while not redisInterface.hasPending():
        sleep(1)
    page = redisInterface.popPending()
class Method(MethodInterface): """ Produce reference text by submitting the citance to the ElasticSearch server. """ method_opts = {'maxsize': {'type': int, 'default': 100}, 'stopwords-path': {'default': STOPWORDS_PATH}, 'remove-stopwords': {'default': False, 'action': 'store_true'}, 'combine': {'default': False, 'action': 'store_true'}, 'analyzer': {'default': False, 'type': str}, 'ngram': {'default': False, 'type': int}, 'concept_boost': {'default': 3, 'type': int}, 'np_boost': {'default': 3, 'type': int}, 'sent_boost': {'default': 1, 'type': int}, 'stem_boost': {'default': 1, 'type': int}, 'runmode': {'default': 'train'}} def __init__(self, args, opts): super(Method, self).__init__(args, opts) self.es_int = ESInterface(host=self.opts.server, port=self.opts.port, index_name=self.opts.index_name) self.analyzer = self.es_int.get_index_analyzer() self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|" r"(\[(\d+([,–]\s?)?)+\])|" r"\[[\d,-]+\]").sub self.all_digits = re.compile(r"^\d+$").search if self.opts.remove_stopwords: with file(self.opts.stopwords_path) as f: self.stopwords = frozenset([l.strip().lower() for l in f]) else: self.stopwords = frozenset([]) self.db = MySQLdb.connect(host=constants.mysql_server, port=constants.mysql_port, user=constants.mysql_user, passwd=constants.mysql_pass, db=constants.mysql_db) self.cur = self.db.cursor() self.ttys = ['SY'] ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'), "chemicals": ('CCN', 'CSN'), "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'), "diseases": ('DI', ), "findings": ('FI', ), "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ), "preferred": ('PTN', 'PT')} self.doc_mod = documents_model.DocumentsModel(opts.anns_dir) # self.ann_client = AnnotationsClient() self.reg_apa = re.compile( # [Chen et al.2000] r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|" r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|" r"\w+\set al\. \(\d{2,4}\)") # [Chen et al. 200] self.reg_apa_rare = re.compile( r"((([A-Z]\w+\set\sal\.,? 
\d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+") self.reg_apa2 = re.compile( r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)") self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]") self.reg_paranthesis = re.compile( r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)") self.nlp_extractor = Extract_NLP_Tags() self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True) self.lmtzr = WordNetLemmatizer() self.stemmer = stem.porter.PorterStemmer() # if len(args) > 3:s # self.ttys = [] # # for tty in args[3:]: # if tty in ttygroups: # self.ttys.extend(ttygroups[tty]) # else: # self.ttys.append(tty) def expand_concept(self, cdata, synonyms=False): rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'} Okay = True for st in cdata['SemanticTypes']: if st in rejected_semTypes: Okay = False if Okay: if synonyms: return self.concept_synonyms(cdata['ConceptId']) else: return cdata['ConceptId'] def concept_synonyms(self, cui): if cui in evaluate.cachefile: return set(evaluate.cachefile[cui]) else: termtypes = ("and (TTY=" + " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")") # query = 'select * from (select distinct STR from MRCONSO a,'+\ # '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL where cui1 = \'%s\'' % cui +\ # ' and rela is not null) b where a.CUI=b.CUI2 and a.LAT=\'ENG\') dd ;' query = "select STR from MRCONSO where " +\ "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\ termtypes + " and (SAB = 'SNOMEDCT_US')" # print query self.cur.execute(query) # self.cur.execute("select STR from MRCONSO where " + # "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui + # termtypes + " and SAB != 'CHV'") syns = set(filter(lambda y: y.replace(" ", "").isalpha(), [x.lower() for x, in self.cur.fetchall()])) evaluate.cachefile[cui] = list(syns) return syns def run(self, test_data): out_results = [] for ann in test_data: doc_type = '_'.join((ann['topic_id'].lower(), ann['reference_article'][:-4].lower())) doc_type = doc_type.replace(',', '').replace("'", '"') # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME if self.opts.runmode == 'eval': doc_type = doc_type.replace('train', 'eval') doc = self.doc_mod.get_doc( ann['topic_id'].lower(), ann['citing_article']) cit_text = ann['citation_text'] cit_text_doc = doc[ ann['citation_offset'][0]:ann['citation_offset'][1]] cit_marker = ann['citation_marker'] cit_marker_doc = doc[ ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]] cit_mrk_offset_sent = [ann['citation_marker_offset'][0] - ann['citation_offset'][0], ann['citation_marker_offset'][1] - ann['citation_offset'][0]] cleaned = self.reg_apa.sub('', cit_text_doc) cleaned = self.reg_ieee.sub('', cleaned) cleaned = self.reg_paranthesis.sub('', cleaned) cleaned = self.reg_apa_rare.sub('', cleaned) cleaned = re.sub('\s+', ' ', cleaned).strip() cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ') ''' -------------- IMMEDIATE NP BEFORE MARKER ---------- ''' m = list(self.reg_apa.finditer(cit_text_doc)) m1 = list(self.reg_ieee.finditer(cit_text_doc)) m2 = list(self.reg_paranthesis.finditer(cit_text_doc)) # (start, end, group) if len(m) > 0: markers = [(e.start(), e.end(), e.group(0)) for e in m] elif len(m1) > 0: markers = [(e.start(), e.end(), e.group(0)) for e in m1] elif len(m2) > 0: markers = [(e.start(), e.end(), e.group(0)) for e in m2] else: m3 = list(self.reg_apa_rare.finditer(cit_text_doc)) if len(m3) > 0: markers = [(e.start(), e.end(), e.group(0)) for e in m3] else: markers = [] if len(markers) > 10000: nps = 
self.nlp_extractor.parse_by_mbsp(cleaned.strip()) if nps is None: q = cleaned else: t = nps.split(' ') concepts = [] for i in range(len(t)): conc = [] toks = t[i].split('/') while(('NP' in toks[2]) and (i < len(t))): conc.append((toks[0], toks[6])) i += 1 if i < len(t): toks = t[i].split('/') if len(conc) > 0: concepts.append(conc) noun_phrases = [ ' '.join([s1[0] for s1 in t1]) for t1 in concepts] # nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened') # nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps] # nps = [a[1:-1] for sent in nps for piece in sent for a in piece] # for e in nps: # noun_phrases = [(sub_e[0].replace('"', ''),idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords] tokens = self.tokenizer.tokenize(cit_text) tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc) nearest = '' nearest_idx = -1 distance = 100000 # find nearest word to the citation marker for idx, f in enumerate(tokens_offsets): # check to see if in valid span (not citation markers) invalid = False for e in markers: if f[0] >= e[0] and f[1] <= e[1]: invalid = True if (cit_mrk_offset_sent[0] - f[1] >= 0) and\ (cit_mrk_offset_sent[0] - f[1] < distance) and\ not invalid: distance = cit_mrk_offset_sent[0] - f[1] if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0: nearest = tokens[idx] if (idx > 0) and len(re.findall(r"^[^A-Za-z]+$", tokens[idx - 1])) == 0: nearest = tokens[ idx - 1] + ' ' + tokens[idx] nearest_idx = idx elif (cit_mrk_offset_sent[0] < f[1]): break if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\ tokens[nearest_idx] not in stops100: nearest = tokens[idx - 1] + ' ' + tokens[idx] largest = 0 q = '' for n in noun_phrases: if (nearest in n) and (len(nearest.split()) > largest): q = '"%s"' % nearest largest = len(nearest.split()) if q == '': q = cleaned q = sanitize(q) # find longest noun phrase containing the nearest # res = None # for np in nps[0]: # if nearest in np and len(np) > longest and len(np) < 5: # longest = len(np) # res = np # if res is not None: # res = ' '.join([el for el in res]) # else: # res = nearest else: try: qtxt = unicodedata.normalize('NFKD', cleaned).encode('ascii', 'ignore') except: qtxt = cleaned.encode('ascii', 'ignore') qterms = [qtxt] tokens = self.tokenizer.tokenize(' '.join(qterms)) # tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer) q = ' '.join([t for t in tokens if (t not in self.stopwords and not(self.all_digits(t)))]) if self.opts.concept_boost > 0: qconcepts = mmrun(cleaned) qcids = [] for cdata in qconcepts['concepts']: newterms = self.expand_concept(cdata) if newterms is not None: qcids.append(newterms) else: qcids = [] if self.opts.np_boost > 0: nps = self.nlp_extractor.extract_NP(qtxt, mode='flattened') noun_phs = set() for e in nps: for e1 in e: if len(e1) < 4: all_stop = False if self.opts.remove_stopwords: tmp = ' '.join(sub_e.replace('"', '') for sub_e in e1 if sub_e.replace('"', '') not in self.stopwords) else: count = 0 for sub_e in e1: if sub_e.replace('"', '') in self.stopwords: count += 1 if count == len(e1): all_stop = True tmp = ' '.join(sub_e.replace('"', '') for sub_e in e1) if '"' + tmp.replace('"', '') + '"' not in noun_phs and not all_stop: noun_phs.add( '"' + tmp.replace('"', '') + '"') else: noun_phs = [] if self.opts.analyzer: r = self.es_int.simple_search(q, maxsize=self.opts.maxsize, source_fields=[ 'offset', 'sentence'], # field='sentence', doc_type=doc_type, params={'analyzer': self.opts.analyzer}) else: # r = self.es_int.multi_field_search(sentence=q, # 
concepts=' '.join( # [w for w in qcids]), # noun_phrases=' '.join( # [e for e in noun_phs]), # maxsize=self.opts.maxsize, # source_fields=[ # 'offset', 'sentence', 'mm-concepts', 'noun_phrases'], # doc_type=doc_type, # field_boost=[self.opts.sent_boost, # self.opts.concept_boost, # self.opts.np_boost]) fields = [ 'sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed'] tokens1 = [] for w in self.tokenizer.tokenize(cleaned): Okay = True if self.opts.remove_stopwords: if w in self.stopwords: Okay = False if '-' in w: tokens1.append(self.stemmer.stem(w.replace('-', ''))) if Okay: tokens1.append(self.stemmer.stem(w)) field_vals = [q, ' '.join([w for w in qcids]), (' '.join([e for e in noun_phs])).replace( '"', ''), ' '.join([w for w in tokens1])] field_boosts = [ self.opts.sent_boost, self.opts.concept_boost, self.opts.np_boost, self.opts.stem_boost] r = self.es_int.multi_field_search(field_vals=field_vals, fields=fields, source_fields=[ 'offset', 'sentence'], maxsize=self.opts.maxsize, field_boost=field_boosts, doc_type=doc_type) # r = self.es_int.find_all(doc_type=doc_type, source_fields=['offset','sentence']) for e in r: fld = e.pop('fields') e['offset'] = [eval(fld['offset'][0])] # beg = e['offset'][0][0] - \ # 100 if e['offset'][0][0] else e['offset'][0][0] # end = e['offset'][0][1] + 100 # e['offset'] = [(beg, end)] e['sentence'] = fld['sentence'][0] e['query'] = q if self.opts.combine: if len(r) == 0: r = [{'_type': doc_type, '_index': self.opts.index_name, '_score': 0, 'score': 0, 'sentence': [''], 'offset': [(0, 1)], 'query':q, '_id':-11}] r = [{'_type': r[0]['_type'], '_index': r[0]['_index'], 'query': q, 'topic': ann['topic_id'].lower(), 'citance_number': ann['citance_number'], 'citation_text': ann['citation_text'], 'citing_article': ann['citing_article'], '_score': sum([e['_score'] for e in r]), 'offset': [e['offset'][0] for e in r], 'sentence': [e['sentence'] for e in r], '_id': '-000001'}] out_results.append(r) return out_results
class Tokenizer(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'\b([a-zA-Z]+)\b')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]
class NLPCorpus(object):
    def __init__(self, corpus, tokenize_str, delimiter, n, max_length):
        self.corpus = corpus
        self.tokenizer = RegexpTokenizer(tokenize_str)
        self.delimiter = delimiter
        self.n = n
        self.max_length = max_length
        # use set methods to set these variables
        self.tokenized_corpus = []
        self.startList = []
        self.ngramDict = defaultdict(list)
        self.unigramDict = defaultdict(list)
        self.set_tokenized_corpus()
        self.set_ngrams()

    def set_tokenized_corpus(self):
        self.tokenized_corpus = [self.tokenizer.tokenize(sentence)
                                 for sentence in self.corpus.split(self.delimiter)]
        # the last member is always empty, so remove it
        self.tokenized_corpus.pop()

    def set_ngrams(self):
        for sentence in self.tokenized_corpus:
            length = len(sentence)
            # append empty string to indicate the end of a sentence
            sentence.append('')
            if length >= self.n:
                self.startList.append(tuple(sentence[0:self.n]))
                for i in range(length):
                    self.unigramDict[sentence[i]].append(sentence[i + 1])
                    if i <= (length - self.n):
                        self.ngramDict[tuple(sentence[i:i + self.n])].append(sentence[i + self.n])
            else:
                self.startList.append(tuple(sentence))
                [self.unigramDict[sentence[j]].append(sentence[j + 1]) for j in range(length)]
                self.ngramDict[tuple(sentence)].append('')

    def generate_sentence(self):
        # the start of a generated sentence is always the start of a sentence from the corpus
        key = choice(self.startList)
        sentence = list(key)
        sentence_length = len(" ".join(sentence))
        # keep track of how many n-grams only have a single choice as the following word
        single_choice = 0
        while True:
            if len(self.ngramDict[key]) == 1:
                single_choice += 1
            # use a unigram to select the next word to add more variety
            if single_choice != 3:
                # select one of the words mapped to the current ngram key
                word = choice(self.ngramDict[key])
            else:
                word = choice(self.unigramDict[key[1]])
                single_choice = 0
            sentence_length += len(word) + 1
            if sentence_length <= self.max_length and word:
                sentence.append(word)
                key = key[1:] + (word,)
            else:
                break
        return " ".join(sentence)
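# Hedged usage sketch of NLPCorpus: a tiny period-delimited corpus, word tokens via r"\w+",
# bigram context (n=2) and a 60-character cap. The corpus text is made up, and the generated
# sentence is random (choice()), so output varies between runs.
corpus_text = "the cat sat on the mat. the cat ran to the door. the dog sat on the rug."
model = NLPCorpus(corpus_text, r"\w+", ".", 2, 60)
print(model.generate_sentence())
# e.g. "the cat sat on the rug" -- any walk through the collected bigrams is possible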
def __init__(self): """Setup a new connection""" print(yellow("Initializing new YakDB connection")) self.db = YakDBDocumentDatabase() # Initialize NLTK objects self.nerTokenizer = RegexpTokenizer(r'\s+', gaps=True)
# read the file "movies_comments.csv"
df = pd.read_csv('/home/paul/Downloads/movies_comments.csv')

# display the first rows of df
df.head(10)

# build the dictionary:
sunText = ""
for words in df.Text:
    sunText += " " + words

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.regexp import RegexpTokenizer

tokenizer = RegexpTokenizer("[a-zA-Zé]{4,}")
vectorizer = CountVectorizer()
vectorizer.fit_transform(tokenizer.tokenize(sunText.lower()))


# vectorize the words:
def vect1(words):
    liste = []
    tokens = tokenizer.tokenize(words.lower())
    for word in tokens:
        liste.append(vectorizer.transform([word]).toarray())
    return np.asarray(liste)


def convY(word_conv):
def tokenize(text):
    str_ = "[A-Za-z]+"
    regex_tokens = RegexpTokenizer(str_)
    tokens = regex_tokens.tokenize(text.lower())
    stems = stem_tokens(tokens, WNL)
    return stems
def __init__(self):
    RegexpTokenizer.__init__(self, r'[\w+#]+|[^\w\s]+')
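# The character class [\w+#] lets language names like "C++" and "C#" survive as single tokens,
# while other punctuation runs are kept separate. A quick check with made-up text, assuming the
# enclosing subclass is named, say, CodeTokenizer (the real name is not shown in this excerpt):
print(CodeTokenizer().tokenize("I know C++, C# and Python!"))
# ['I', 'know', 'C++', ',', 'C#', 'and', 'Python', '!']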
def get_word_symbols_tokens(cls, text):
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    return tokenizer.tokenize(text)
def get_word_tokens(cls, text):
    tokenizer = RegexpTokenizer('\w+')
    return tokenizer.tokenize(text)
def no_stop_tokens(self, text):
    tokens = []
    tokenizer = RegexpTokenizer('(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
    tokens += tokenizer.tokenize(text)
    return tokens
class Method(MethodInterface): """ Produce reference text by submitting the citance to the ElasticSearch server. """ method_opts = {'maxsize': {'type': int, 'default': 3}, 'stopwords-path': {'default': STOPWORDS_PATH}, 'remove-stopwords': {'default': False, 'action': 'store_true'}, 'combine': {'default': False, 'action': 'store_true'}, 'analyzer': {'default': False, 'type': str}, 'ngram': {'default': False, 'type': int}} def __init__(self, args, opts): super(Method, self).__init__(args, opts) self.es_int = ESInterface(host=self.opts.server, port=self.opts.port, index_name=self.opts.index_name) self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|" r"(\[(\d+([,–]\s?)?)+\])|" r"\[[\d,-]+\]").sub self.all_digits = re.compile(r"^\d+$").search if self.opts.stopwords_path: stop_path = self.opts.stopwords_path else: stop_path = STOPWORDS_PATH if self.opts.remove_stopwords: with file(self.opts.stopwords_path) as f: self.stopwords = frozenset([l.strip().lower() for l in f]) else: self.stopwords = frozenset([]) self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True) def run(self, test_data): # with codecs.open('tmp/test_data.json', 'wb', 'utf-8') as mf: # json.dump(test_data, mf, indent=2) out_results = [] det_res = {} for ann in test_data: doc_type = '_'.join((ann['topic_id'].lower(), ann['reference_article'][:-4].lower())) # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME doc_type = doc_type.replace('train', 'eval') doc_type = doc_type.replace(',', '').replace("'", '"') # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME doc_type = doc_type.replace('eval', 'train') authors = set((ann['reference_article'][:-4].lower().strip(), ann['citing_article'][:-4].lower().strip())) # preprocess (removes citations) and tokenizes # citation text before submitting to elasticsearch q = self.regex_citation('', ann['citation_text']) q = q.encode('ascii', 'ignore') # tokens = self.es_int.tokenize(q, "sentence") tokens = self.tokenizer.tokenize(q) tokens = ['"' + t + '"' if '-' in t else t for t in tokens] q = ' '.join([t for t in tokens if (t not in self.stopwords and t not in authors and not(self.all_digits(t)))]) if self.opts.ngram: tokens = self.es_int.tokenize(q, "sentence") new_query = '' for i in range(len(tokens) - self.opts.ngram): tmp = '' for j in range(i, i + self.opts.ngram): tmp += tokens[j] + ' ' new_query += '"' + tmp.strip() + '" ' q = new_query.strip() # q = '*:*' if self.opts.analyzer: r = self.es_int.simple_search(q, maxsize=self.opts.maxsize, source_fields=[ 'offset', 'sentence'], # field='sentence', doc_type=doc_type, params={'analyzer': self.opts.analyzer}) else: r = self.es_int.simple_search(q, maxsize=self.opts.maxsize, source_fields=[ 'offset', 'sentence'], # field='sentence', doc_type=doc_type) for e in r: fld = e.pop('fields') e['offset'] = [eval(fld['offset'][0])] # beg = e['offset'][0][0] - \ # 100 if e['offset'][0][0] else e['offset'][0][0] # end = e['offset'][0][1] + 100 # e['offset'] = [(beg, end)] e['sentence'] = fld['sentence'][0] e['query'] = q e['topic'] = ann['topic_id'].lower() if self.opts.combine: if len(r) == 0: r = [{'_type': doc_type, '_index': self.opts.index_name, '_score': 0, 'sentence': '', 'offset': [(0, 1)], 'query':q, '_id':-11}] r = [{'_type': r[0]['_type'], '_index': r[0]['_index'], 'query': q, 'topic': ann['topic_id'].lower(), 'citance_number': ann['citance_number'], 'citation_text': ann['citation_text'], 'citing_article': ann['citing_article'], '_score': sum([e['_score'] for e in r]), 'offset': [e['offset'][0] for e in r], 
'sentence': [e['sentence'] for e in r], '_id': '-000001'}] out_results.append(r) # with codecs.open('tmp/out_results.json', 'wb', 'utf-8') as mf: # json.dump(out_results, mf, indent=2) # sys.exit() return out_results
def words(self, text):
    reg_words = r'[\#|\$|\w|َ|ِ|ً|ٌ|ٍ|ْ|ّ|ُ]+'
    tokenizer = RegexpTokenizer(reg_words, flags=re.UNICODE | re.IGNORECASE)
    return tokenizer.tokenize(text)
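# Hedged sketch of the tokenizer above: the character class keeps '#', '$', word characters and
# Arabic diacritics together, so hashtags, prices and vocalized Arabic words come out as single
# tokens. Sample text is made up; exact output depends on the Unicode handling of the Python
# version in use.
from nltk.tokenize import RegexpTokenizer
import re

tokenizer = RegexpTokenizer(r'[\#|\$|\w|َ|ِ|ً|ٌ|ٍ|ْ|ّ|ُ]+', flags=re.UNICODE | re.IGNORECASE)
print(tokenizer.tokenize(u"#عربي $100 كَتَبَ"))  # roughly: [u'#عربي', u'$100', u'كَتَبَ']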
def __init__(self):
    RegexpTokenizer.__init__(self, r'\w+[-\w+]*|[^\w\s]+')
def __init__(self):
    self.tok = RegexpTokenizer(r'\b([a-zA-Z]+)\b')
    self.stemmer = LancasterStemmer()
class TranslatronProtocol(WebSocketServerProtocol): def __init__(self): """Setup a new connection""" print(yellow("Initializing new YakDB connection")) self.db = YakDBDocumentDatabase() # Initialize NLTK objects self.nerTokenizer = RegexpTokenizer(r'\s+', gaps=True) def onConnect(self, request): pass def onOpen(self): pass def performDocumentSearch(self, query): """ Perform a token search on the document database. Search is performed in multi-token prefix (all must hit) mode. Tokens with no hits at all are ignored entirely """ startTime = time.time() queryTokens = map(str.lower, word_tokenize(query)) levels = [b"title", b"content", b"metadata"] #Remove 1-token parts from the query -- they are way too general! #Also remove exclusively-non-alnum tokens queryTokens = [tk for tk in queryTokens if (len(tk) > 1 and has_alpha_chars(tk))] results = self.db.searchDocumentsMultiTokenPrefix(queryTokens, levels=levels) #Return only those paragraphs around the hit paragraph (or the first 3 pararaphs) for hitLocation, doc in results.items(): (docId, docLoc) = InvertedIndex.splitEntityIdPart(hitLocation) #Compute which paragraphs to display minShowPar = 0 maxShowPar = 2 if docLoc.startswith(b"paragraph"): paragraphNo = int(docLoc[9:]) minShowPar = max(0, paragraphNo - 1) maxShowPar = min(len(doc[b"paragraphs"]), paragraphNo + 1) #Modify documents results[hitLocation][b"hitLocation"] = docLoc results[hitLocation][b"paragraphs"] = doc[b"paragraphs"][minShowPar:maxShowPar] # Measure timing timeDiff = (time.time() - startTime) * 1000.0 print("Document search for %d tokens took %.1f milliseconds" % (len(queryTokens), timeDiff)) return results def uniquifyEntities(self, entities): """Remove duplicates from a list of entities (key: ["id"])""" seen = set() result = [] for entity in entities: itemId = entity[b"id"] if itemId in seen: continue seen.add(itemId) result.append(entity) return result def performEntitySearch(self, query): """ Search entities. Tokens are not splitted in order to allow simple search for multi-token entities like "Biological process" """ results = self.db.searchEntitiesSingleTokenMultiExact([query], level=b"aliases") #Return only result array. TODO can't we just use results[query] if query not in results: return [] return results[query] def filterNERTokens(self, token): """ Filter function to remove stuff that just clutters the display. """ #Short numbers are NOT considered database IDs. #NOTE: In reality, pretty much all numbers are Allergome database IDs, e.g. see # http://www.allergome.org/script/dettaglio.php?id_molecule=14 if len(token) <= 5 and token.isdigit(): return False return True def performEntityNER(self, query): "Search a query text for entity/entity alias hits" startTime = time.time() tokens = self.nerTokenizer.tokenize(query) queryTokens = [s.encode("utf-8") for s in tokens] # Search for case-sensitive hits searchFN = InvertedIndex.searchSingleTokenMultiExact results = searchFN(self.db.entityIdx.index, frozenset(filter(self.filterNERTokens, queryTokens)), level=b"aliases") # Results contains a list of tuples (dbid, db) for each hit. dbid is db + b":" + actual ID # We only need the actual ID, so remove the DBID prefix (which is required to avoid inadvertedly merging entries). # This implies that the DBID MUST contain a colon! results = {k: [(a.partition(b":")[2], b) for (a, b) in v] for k, v in results.items() if v} # # Multi-token NER # Based on case-insensitive entries where only the first token is indexed. 
# # TESTING: Multi token NER lowercaseQueryTokens = [t.lower() for t in queryTokens] t1 = time.time() ciResults = searchFN(self.db.entityIdx.index, frozenset(lowercaseQueryTokens), level=b"cialiases") t2 = time.time() print("TX " + str(t2 - t1)) for (firstTokenHit, hits) in ciResults.items(): #Find all possible locations where the full hit could start, i.e. where the first token produced a hit possibleHitStartIndices = [i for i, x in enumerate(lowercaseQueryTokens) if x == firstTokenHit] #Iterate over all possible for hit in hits: hitLoc, _, hitStr = hit[1].rpartition(b"\x1D") # Full (whitespace separated) entity name if not hitStr: continue #Ignore malformed entries. Should usually not happen hitTokens = [t.lower() for t in hitStr.split()] numTokens = len(hitTokens) #Check if at any possible hit start index the same tokens occur (in the same order ) for startIdx in possibleHitStartIndices: actualTokens = lowercaseQueryTokens[startIdx : startIdx+numTokens] #Check if the lists are equal. Shortcut for single-token hits if numTokens == 1 or all((a == b for a, b in zip(actualTokens, hitTokens))): #Reconstruct original (case-sensitive) version of the hit csTokens = queryTokens[startIdx : startIdx+numTokens] #NOTE: This MIGHT cause nothing to be highlighted, if the reconstruction # of the original text is not equal to the actual text. This is true exactly # if the tokenizer removes or changes characters besides whitespace in the text. csHit = b" ".join(csTokens) # Emulate defaultdict behaviour if not csHit in results: results[csHit] = [] results[csHit].append((hitStr, hitLoc)) t3 = time.time() print("TY " + str(t3 - t2)) # TODO: Remove results which are subsets of other hits. This occurs only if we have multi-token results removeKeys = set() # Can't modify dict while iterating it, so aggregate keys to delete for key in results.keys(): # Ignore single part results if any((chr(c).isspace() for c in key)): tokens = key.split() for token in tokens: # Remove sub-hit in results. 
# This avoids the possibility of highlighting the smaller hit if token in results: removeKeys.add(token) # Remove aggregated keys for key in removeKeys: del results[key] # Result: For each token with hits --> (DBID, Database name) # Just takes the first DBID.It is unlikely that different DBIDs are found, but we # can only link to one using the highlighted label ret = {k: (v[0][0], v[0][1]) for k, v in results.items() if v} # Measure timing timeDiff = (time.time() - startTime) * 1000.0 print("NER for %d tokens took %.1f milliseconds" % (len(queryTokens), timeDiff)) return ret def onMessage(self, payload, isBinary): request = json.loads(payload.decode('utf8')) # Perform action depending on query type qtype = request["qtype"] if qtype == "docsearch": results = self.performDocumentSearch(request["term"]) del request["term"] request["results"] = list(results.values()) elif qtype == "ner": results = self.performEntityNER(request["query"]) del request["query"] request["results"] = results elif qtype == "metadb": # Send meta-database to generate request["results"] = metaDB elif qtype == "entitysearch": request["entities"] = self.performEntitySearch(request["term"]) del request["term"] elif qtype == "getdocuments": # Serve one or multiple documents by IDs docIds = [s.encode() for s in request["query"]] request["results"] = self.db.docIdx.findEntities(docIds) del request["query"] else: print(red("Unknown websocket request type: %s" % request["qtype"], bold=True)) return # Do not send reply #Return modified request object: Keeps custom K/V pairs but do not re-send query self.sendMessage(json.dumps(request, default=documentSerializer).encode("utf-8"), False) def onClose(self, wasClean, code, reason): print("WebSocket connection closed: {0}".format(reason))