def frequencyGenerator(self, s):
        
        pat = '[0-9|.| |\-|\[|\]|-|!|,|\\n|\\|/|:|"|(|)|=|<|>|@|\'|#]'

        tokenizer= RegexpTokenizer(pat,gaps=True)
        allWords = tokenizer.tokenize(s)
        #stemmer = WordnetStemmer()
        allWords = map(lambda x:x.lower().strip(),allWords)
        #allWordsStemmed = map(lambda x: stemmer.lemmatize(x),allWords)        
        #del(allWords)
        allWords = self.cleanedWords(allWords)
        allWordsStemmed = allWords

        allWordsStemmed = filter(lambda x:len(x)>2,allWordsStemmed)
        #allWordsStemmed = filter(lambda x:len(x)>2,map(lambda x: stemmer.lemmatize(x),allWords))
        
        dic={}
        for i in allWordsStemmed:
            if dic.has_key(i):
                dic[i] = dic[i]+1
            else:
                dic[i]= 1

        st=''
        dic=sorted(dic.items(), key=lambda(k,v):(v,k),reverse=True)

        for k in dic:
            try:
                st+=str(k[0])+','+str(k[1])+','
            except:
                pass

        print st
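The snippet above is Python 2 (dict.has_key, the tuple-unpacking lambda, the print statement). For reference, a minimal Python 3 sketch of the same frequency-counting idea using collections.Counter and a simplified separator pattern; this is an illustrative rewrite, not the original code:

from collections import Counter
from nltk.tokenize import RegexpTokenizer

def frequency_counts(text, min_len=3):
    # with gaps=True the pattern acts as a separator, so this splits on
    # runs of non-word characters instead of matching tokens directly
    tokenizer = RegexpTokenizer(r'\W+', gaps=True)
    words = (w.lower().strip() for w in tokenizer.tokenize(text))
    counts = Counter(w for w in words if len(w) >= min_len)
    # most_common() already sorts by descending count
    return ','.join('%s,%d' % (w, n) for w, n in counts.most_common())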
Example #2
    def parse(self, fname):
        """
        Парсинг текста файла
        :param fname: имя файла
        :return: (<имя_файла>, тошнота, мошенничество)
        """
        density, fraud = 0, 0
        with codecs.open(fname, "r", encoding="utf-8") as f:
            text = f.read()
        tknz = RegexpTokenizer(pattern="[А-Яа-яёA-Za-z]+")  # A-Za-z: the original A-z range also matched the punctuation between 'Z' and 'a'
        txt_list = tknz.tokenize(text)
        if txt_list:
            for i, word in enumerate(txt_list):
                new_word = self.check_word(word)
                if new_word:
                    txt_list[i] = new_word
                    fraud += 1

            txt_list = [
                word.lower() for word in txt_list
                if not (word.lower() in self.sw)
            ]
            stemmer_ru = RussianStemmer()
            txt_list = [
                stemmer_ru.stem(token.lower()) for token in txt_list
                if len(token) > 1
            ]
            dict_w = Counter(txt_list)
            top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
            top5_count = sum([dict_w[word] for word in top5])
            density = top5_count / len(txt_list)
        # the (fraud > 2) threshold was chosen based on testing on the available sample
        # ads often contain strings like "ШxДхВ" (WxDxH) that we cannot recognize unambiguously
        # this criterion is open to discussion and could perhaps be adjusted
        return fname, density, fraud > 2
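The density score in Example #2 is just the share of all tokens taken by the five most frequent stems. A standalone sketch of that metric (token list assumed non-empty):

from collections import Counter
import heapq

def top5_density(tokens):
    # fraction of the text covered by the five most frequent tokens
    counts = Counter(tokens)
    top5 = heapq.nlargest(5, counts, key=counts.get)
    return sum(counts[w] for w in top5) / len(tokens)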
Example #3
def tokenize(text):
    # This regex accepts alphabetic words only.
    str_ = "[A-Za-z]+"
    regex_tokens = RegexpTokenizer(str_)
    tokens = regex_tokens.tokenize(text.lower())   
    stems = stem_tokens(tokens, WNL)
    return stems
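Example #3 relies on two module-level names, WNL and stem_tokens, that are not shown. A plausible minimal definition (hypothetical; the original project's helpers may differ):

from nltk.stem import WordNetLemmatizer

WNL = WordNetLemmatizer()

def stem_tokens(tokens, lemmatizer):
    # despite the name, this lemmatizes rather than stems
    return [lemmatizer.lemmatize(t) for t in tokens]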
Example #4
def tokenize_sentence(text, preprocess=True):
    '''
    Tokenizes the given sentence and applies preprocessing if requested
    (conversion to lower case and digit substitution).
    '''
    if preprocess:
        text = re.sub(r'\d', '9', text.lower())

    tokenizer_regexp = ur'''(?ux)
    ([^\W\d_]\.)+|                # one letter abbreviations, e.g. E.U.A.
    \d{1,3}(\.\d{3})*(,\d+)|      # numbers in format 999.999.999,99999
    \d{1,3}(,\d{3})*(\.\d+)|      # numbers in format 999,999,999.99999
    \d+:\d+|                      # time and proportions
    \d+([-\\/]\d+)*|              # dates. 12/03/2012 12-03-2012
    [DSds][Rr][Aa]?\.|            # common abbreviations such as dr., sr., sra., dra.
    [Mm]\.?[Ss][Cc]\.?|           # M.Sc. with or without capitalization and dots
    [Pp][Hh]\.?[Dd]\.?|           # Same for Ph.D.
    [^\W\d_]{1,2}\$|              # currency
    (?:(?<=\s)|^)[\#@]\w*[A-Za-z_]+\w*|  # Hashtags and twitter user names
    -[^\W\d_]+|                   # clitic pronouns with leading hyphen
    \w+([-']\w+)*|                # words with hyphens or apostrophes, e.g. não-verbal, McDonald's
    -+|                           # any sequence of dashes
    \.{3,}|                       # ellipsis or sequences of dots
    \S                            # any non-space character
    '''
    tokenizer = RegexpTokenizer(tokenizer_regexp)

    return tokenizer.tokenize(text)
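The ur'' prefix makes Example #4 Python 2 only. Below is a trimmed-down Python 3 sketch of the same verbose-regex approach, using non-capturing groups so the result does not depend on how a given NLTK version treats capturing groups (pattern heavily shortened for illustration):

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'''(?x)
      (?:[^\W\d_]\.)+       # one-letter abbreviations, e.g. E.U.A.
    | \d+:\d+               # times and proportions
    | \w+(?:[-']\w+)*       # words with internal hyphens or apostrophes
    | \S                    # any other non-space character
''')
print(tokenizer.tokenize("Dr. Smith arrived at 10:30 and paid $50."))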
Example #5
    def tokenize(self, text):
        """
           tokenize text into a list of Token objects

            :param text: text to be tokenized (might contains several sentences)
            :type text: str
            :return: List of Token objects
            :rtype: list(Token)
        """
        tokens = []

        if self.tokenizer_type == "SpaceTokenizer":
            operator = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
            for counter, span in enumerate(operator.span_tokenize(text)):
                new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
                tokens.append(new_token)

        elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
            operator = WhitespaceTokenizer()
            for counter, span in enumerate(operator.span_tokenize(text)):
                new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
                tokens.append(new_token)

        elif self.tokenizer_type == "PTBTokenizer":
            ptb_tokens = word_tokenize(text)
            counter = 0
            for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
                new_token = Token(counter, token, span[0], span[1])
                counter += 1
                tokens.append(new_token)

        return tokens
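Example #5 assumes a Token class holding an index, the surface string and its character span, plus a _penn_treebank_tokens_with_spans helper. A minimal Token container along those lines (hypothetical; the real class may carry more fields):

class Token:
    """Minimal token record: running index, surface form, and character span."""
    def __init__(self, index, text, start, end):
        self.index = index
        self.text = text
        self.start = start
        self.end = end

    def __repr__(self):
        return 'Token(%d, %r, %d, %d)' % (self.index, self.text, self.start, self.end)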
Example #6
def get_emails_sent_by_person_list(emails_df):
    tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')
    emails_df['subject_wc'] = emails_df['subject'].map(
        lambda x: len(tokenizer.tokenize(x)))
    emails_df['content_wc'] = emails_df['content'].map(
        lambda x: len(tokenizer.tokenize(x)))

    grouped_by_people = emails_df.groupby('from').agg({
        'content': 'count',
        'subject_wc': 'mean',
        'content_wc': 'mean',
    })

    grouped_by_people.rename(columns={
        'content': 'N emails',
        'subject_wc': 'Subject word count',
        'content_wc': 'Content word count'
    },
                             inplace=True)

    grouped_by_people = grouped_by_people.sort_values(by=['N emails'], ascending=False)  # assign back: sort_values is not in-place

    file_path_send = file_path = os.path.join(dir_path,
                                              'results/emails_by_person.csv')

    grouped_by_people.to_csv(file_path_send)
Example #7
def token_words(lyric):
    """
    in: lyric(element of row['text'])
    take whole lyric and convert it into list of words for analysis
    apply few cleaning processes tot remove punctuation & stopwords & errors(Minor focus on this)
    return: list of words in the lyric
    """
    lyric = lyric.lower()
    """
     tokenizer that will tokenize lyric('text') into words without punctuation
     it will split aphostrophe words into 2 seperate words but its okay
     as most of the time words with aphostrophe are non-main verbs(would,should,etc)
     non-main verbs are usually insignificant in most of the context and will be deleted
     e.g : would've = would ve but this is fine as we know stopwords will remove ve
     tweetTokenizer was producing very irregular words in lyric such as (8, numbers and was dist
    """
    #apply tokenizer
    tokenizer1 = RegexpTokenizer("[a-z]+")
    words = tokenizer1.tokenize(lyric)
    #convert list of stopwords to set of stopwords for faster access
    en_stopwords = set(stopwords.words('english'))
    #we remove stopwords from words
    #and add a few extra words (e.g. 'chorus') to the stopword set for a cleaner result
    en_stopwords.add('chorus')
    #single letters aren't really words :)
    for c in ascii_lowercase:
        en_stopwords.add(c)

    words_lyric = [w for w in words if not w in en_stopwords]

    #postProcess of words_lyric
    words_lyric = postProcess(words_lyric)

    return words_lyric
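token_words() ends by calling a postProcess helper that is not shown. A plausible minimal stand-in (hypothetical; the original may do more aggressive cleaning):

def postProcess(words):
    # drop obvious noise: non-alphabetic tokens and leftover single letters
    return [w for w in words if w.isalpha() and len(w) > 1]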
Example #8
 def no_stop_tokens(self,text):
     tokens = []
     tokenizer = RegexpTokenizer('(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
     tokens += tokenizer.tokenize(text)
     #stemmer = nltk.stem.snowball.EnglishStemmer()
     #tokens = map(lambda x: stemmer.stem(x),tokens)
     return tokens
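The pattern in Example #8 targets decimal amounts (optionally with a dollar sign) and hyphen-joined words. Depending on the NLTK version, capturing groups inside a RegexpTokenizer pattern can change what tokenize() returns, so a non-capturing rewrite is the safer form; a small sketch:

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\$?\d+\.\d+|(?:\w+-)*\w+')
print(tokenizer.tokenize('state-of-the-art models cost $3.50 per run'))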
Example #9
def tokenize_sentence(text, preprocess=True):
    '''
    Tokenizes the given sentence and applies preprocessing if requested
    (conversion to lower case and digit substitution).
    '''
    if preprocess:
        text = re.sub(r'\d', '9', text.lower())
    
    tokenizer_regexp = ur'''(?ux)
    ([^\W\d_]\.)+|                # one letter abbreviations, e.g. E.U.A.
    \d{1,3}(\.\d{3})*(,\d+)|      # numbers in format 999.999.999,99999
    \d{1,3}(,\d{3})*(\.\d+)|      # numbers in format 999,999,999.99999
    \d+:\d+|                      # time and proportions
    \d+([-\\/]\d+)*|              # dates. 12/03/2012 12-03-2012
    [DSds][Rr][Aa]?\.|            # common abbreviations such as dr., sr., sra., dra.
    [Mm]\.?[Ss][Cc]\.?|           # M.Sc. with or without capitalization and dots
    [Pp][Hh]\.?[Dd]\.?|           # Same for Ph.D.
    [^\W\d_]{1,2}\$|              # currency
    (?:(?<=\s)|^)[\#@]\w*[A-Za-z_]+\w*|  # Hashtags and twitter user names
    -[^\W\d_]+|                   # clitic pronouns with leading hyphen
    \w+([-']\w+)*|                # words with hyphens or apostrophes, e.g. não-verbal, McDonald's
    -+|                           # any sequence of dashes
    \.{3,}|                       # ellipsis or sequences of dots
    \S                            # any non-space character
    '''
    tokenizer = RegexpTokenizer(tokenizer_regexp)
    
    return tokenizer.tokenize(text)
Example #10
def create_sents(toks):
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    s = RegexpTokenizer(wordre).tokenize(toks)
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    for sentence in s:
        RegexpTokenizer(wordre).tokenize(sentence)
    return toks
Example #11
	def __init__(self):
		self.tokenize=RegexpTokenizer(r'\b([A-Za-z]+)\b') #remove the punctuations
		if ver==2:
			self.stemmer = SnowballStemmer("english")         #using stemmed version of words
		elif ver==1:
			self.stemmer = LancasterStemmer()	
		else:
			self.stemmer = PorterStemmer()
Example #12
 def tokenize(self,text):
     tokens = []
     wordnet_lemmatizer = WordNetLemmatizer()
     tokenizer = RegexpTokenizer('(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
     tokens += tokenizer.tokenize(text)
     tokens = filter(lambda x: x.lower() not in STOP_WORDS and len(x) >1 ,tokens)
     tokens = map(lambda token: wordnet_lemmatizer.lemmatize(token), tokens)
     return tokens
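Under Python 3 the filter/map calls in Example #12 return lazy iterators rather than lists, and recent NLTK versions expect non-capturing groups in RegexpTokenizer patterns. A standalone sketch with both adjustments (function and parameter names are illustrative; the stop-word set is passed in explicitly here):

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

def tokenize_text(text, stop_words):
    lemmatizer = WordNetLemmatizer()
    # non-capturing groups keep tokenize() returning whole matches
    tokenizer = RegexpTokenizer(r'\$?\d+\.\d+|(?:\w+-)*\w+')
    tokens = [t for t in tokenizer.tokenize(text)
              if t.lower() not in stop_words and len(t) > 1]
    return [lemmatizer.lemmatize(t) for t in tokens]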
Example #13
def readreviews(filename,header=True, fieldsep="\t"):
	''' 
		Reads the training file for LDA.
		Parameters:
			- filename: input filename
			- header: True if the header line is present; False otherwise.
			- fieldsep: separator
		Return:
			- a list of entry (where each entry is a tuple of ID, list of tokens, sale/nosale).
	'''
	review_data = list()
	prevsid = ""
	#filehandle = open(filename, "r")
	stopwords = loadstopwords("english.stop.txt")

	
	pdsnotthere=0
	title_notthere=0
	tokenizer = RegexpTokenizer('[a-z]\w+')
	with open(filename, 'rU') as filename:
		filehandle = csv.reader(filename,delimiter='\t', quotechar='"')
		#print filehandle
		for line in filehandle:
			#lineparts = line.split(fieldsep)
			fields=line
			if header:
				header=False
				continue

			if len(fields)!=12:
				continue

			print len(fields)

			"""
			print len(fields)
			print fields
			"""
			HotelID=fields[0]
			hotelname=fields[1]
			HotelURL=fields[2]
			Address=fields[3]
			ImgURL=fields[4]
			Author=fields[5]
			Price=fields[6]
			location=fields[7]
			Title=fields[8]
			ReviewID=fields[9]
			Content=fields[10]
			Rating_ovarall=fields[11]

			review_content=Title+' '+Content
			review_discription=review_content

			words = [ token for token in tokenizer.tokenize(review_discription) if token not in stopwords ]
			review_data.append( words)

	return review_data
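readreviews() depends on a loadstopwords helper that is not shown; a plausible minimal version (hypothetical, assuming one stop word per line in english.stop.txt):

def loadstopwords(path):
    # one stop word per line; returned as a set for fast membership tests
    with open(path) as f:
        return set(line.strip().lower() for line in f if line.strip())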
Example #14
 def tokenize(self,text):
     tokens = []
     tokenizer = RegexpTokenizer('(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
     #tokens += tokenizer.tokenize(self.title.lower())
     tokens += tokenizer.tokenize(text.lower())
     tokens = filter(lambda x: x not in STOP_WORDS and len(x) >1 ,tokens)
     #stemmer = nltk.stem.snowball.EnglishStemmer()
     #tokens = map(lambda x: stemmer.stem(x),tokens)
     return tokens
Example #15
def emailExtractor(sentence, word):
    # https://stackoverflow.com/questions/39777806/how-to-update-nltk-package-so-that-it-does-not-break-email-into-3-different-toke
    pattern = r'\S+@[^\s.]+\.[a-zA-Z]+|\w+|[^\w\s]'
    tokeniser = RegexpTokenizer(pattern)
    for w in tokeniser.tokenize(sentence):
        if re.search('^(\w|\.|\_|\-)+[@](\w|\_|\-|\.)+[.]\w{2,3}$', w):
            context["email"] = w
            return True
    return False
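emailExtractor() writes its result into a module-level context dict and never uses its word argument. A self-contained variant that simply returns the first address found, or None (illustrative only):

import re
from nltk.tokenize import RegexpTokenizer

def extract_email(sentence):
    tokeniser = RegexpTokenizer(r'\S+@[^\s.]+\.[a-zA-Z]+|\w+|[^\w\s]')
    for w in tokeniser.tokenize(sentence):
        if re.search(r'^[\w.\-]+@[\w.\-]+\.\w{2,3}$', w):
            return w
    return None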
Example #16
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {
            "syns": ('AUN', 'EQ', 'SYN', 'MTH'),
            "chemicals": ('CCN', 'CSN'),
            "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
            "diseases": ('DI', ),
            "findings": ('FI', ),
            "hierarchy": ('HS', 'HT', 'HX'),
            "related": ('RT', ),
            "preferred": ('PTN', 'PT')
        }
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        #         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+"
        )
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)"
        )
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
Example #17
    def __init__(self):
        nltk.download("punkt")
        nltk.download("stopwords")
        nltk.download("wordnet")  # lemmatization

        self._tokenizer = RegexpTokenizer(r"\w+")
        self._stop_words = set(stopwords.words("english"))
        # self._stemmer = nltk.stem.SnowballStemmer("english")
        self._lemmatizer = nltk.wordnet.WordNetLemmatizer()
        self._vocabulary = set()
Example #18
def tokenize_large_text_file(file_name, file_location):
    tokens_array = set()
    file = os.path.join(file_location, file_name)
    with open(file,'r') as file:
        for line in file:
            tokenizer = RegexpTokenizer('\s+', gaps=True)
            tokens_array.update(tokenizer.tokenize(line))
    tokens_dict = {str(file_name) + " - {} tokens".format(file_name): list(tokens_array)}
    with open('tokens taken from - {} - .json'.format(str(file_name)), 'w') as f:
        json.dump(tokens_dict, f)
Example #19
    def calcLocation(self, token):
        len = self.instTab.searchFormat(token.operator)
        if len > 0:
            return len
        else:
            if token.operator == "RESW" or token.operator == "WORD":
                len = 3

            elif token.operator == "RESB":
                len = int(token.operand[0])

            elif token.operator == "BYTE":
                len = 1

            elif token.operator == "LTORG":
                len = self.literalTab.literalCount
                self.literalTab.setLiteralCount(0)
                count = 0
                for litCheck in self.literCheck:
                    if litCheck[1:2] == 'C':
                        len = 3
                    else:
                        len = 1
                    self.literalTab.modifyLiteral(
                        litCheck, TokenTable.locCount + (count * len))
                    count += 1

            elif token.operator == "END":
                len = self.literalTab.literalCount
                self.literalTab.setLiteralCount(0)
                count = 0
                for litCheck in self.literCheck:
                    self.literalTab.modifyLiteral(litCheck, token.location)
                    count += 1

            elif token.operator == "EQU":
                if token.operand[0] == "*":
                    len = 0

                else:

                    tokenizer = RegexpTokenizer("-", gaps=True)
                    tokens = tokenizer.tokenize(token.operand[0])

                    value1 = self.symTab.search(tokens[0])
                    value2 = self.symTab.search(tokens[1])
                    len = value1 - value2
                    self.symTab.modifySymbol(token.label, len)
                    len = 0

            else:
                len = -1

        return len
Example #20
 def getDoc_set():
     tokenizer = RegexpTokenizer(r'\w+')
     for doc in getCorpus.corpus_doc:
         #             print type(doc)
         raw = doc.lower()
         tokens = tokenizer.tokenize(raw)
         en_stop = get_stop_words("en")
         stopped_tokens = [i for i in tokens if i not in en_stop]
         p_stemmer = PorterStemmer()
         texts = [p_stemmer.stem(i).encode('utf-8') for i in stopped_tokens]
         getCorpus.doc_set.append(texts)
Example #21
def get_search_terms(request: HttpRequest):
    # Get any search terms

    tr = RegexpTokenizer('[^"\s]\S*|".+?"', gaps=False)
    search_text = str(request.GET.get('search_text', ''))
    # Respect quoted strings
    search_terms = tr.tokenize(search_text)
    if len(search_terms) == 0:
        solr_search_terms = "*"
    else:
        solr_search_terms = ' '.join(search_terms)
    return solr_search_terms
Example #22
def get_search_terms(search_text: str):
    # Get any search terms

    tr = RegexpTokenizer('[^"\s]\S*|".+?"', gaps=False)

    # Respect quoted strings
    search_terms = tr.tokenize(search_text)
    if len(search_terms) == 0:
        solr_search_terms = "*"
    else:
        solr_search_terms = ' '.join(search_terms)
    return solr_search_terms
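In both of the previous two examples the pattern [^"\s]\S*|".+?" keeps quoted phrases together as single tokens, which is what makes the resulting Solr query respect quoted strings. A quick check:

from nltk.tokenize import RegexpTokenizer

tr = RegexpTokenizer(r'[^"\s]\S*|".+?"', gaps=False)
print(tr.tokenize('foo "bar baz" qux'))   # the quoted phrase stays as one token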
Example #23
    def __init__(self, corpus, tokenize_str, delimiter, n, max_length):
        self.corpus = corpus
        self.tokenizer = RegexpTokenizer(tokenize_str)
        self.delimiter = delimiter
        self.n = n
        self.max_length = max_length

        # use set methods to set these variables
        self.tokenized_corpus = []
        self.startList = []
        self.ngramDict = defaultdict(list)
        self.unigramDict = defaultdict(list)
        self.set_tokenized_corpus()
        self.set_ngrams()
Example #24
def sentence_length(corpus):
    too_long_sentences = []
    total_sentences = 0

    tokenizer = RegexpTokenizer("\s+", gaps=True)

    articles = preprocessing(corpus)
    for article in articles:
        sentences = sent_tokenize(article)
        total_sentences += len(sentences)
        for sentence in sentences:
            words = tokenizer.tokenize(sentence)
            if (len(words) > 25):
                too_long_sentences.append((sentence, len(words)))

    return (1 - len(too_long_sentences) / total_sentences) * 100
Example #25
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
Example #26
 def getNew_object(docs, newDoc_object):
     tokenizer = RegexpTokenizer(r'\w+')
     for doc in docs:
         #             print type(doc[1])
         raw = doc[1].lower()
         tokens = tokenizer.tokenize(raw)
         en_stop = get_stop_words("en")
         stopped_tokens = [i for i in tokens if i not in en_stop]
         p_stemmer = PorterStemmer()
         texts = {}
         #             texts = [p_stemmer.stem(i).encode('utf-8') for i in stopped_tokens]
         #             print texts
         for i in stopped_tokens:
             texts[p_stemmer.stem(i).encode('utf-8')] = texts.get(
                 p_stemmer.stem(i).encode('utf-8'), 0) + 1
         newDoc_object.append(texts)
Example #27
 def __init__(self):
     tokenizer_regexp = r'''(?ux)
           # the order of the patterns is important!!
           # more structured patterns come first
           [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+|    # emails
           (?:https?://)?\w{2,}(?:\.\w{2,})+(?:/\w+)*|                  # URLs
           (?:[\#@]\w+)|                     # Hashtags and twitter user names
           (?:[^\W\d_]\.)+|                  # one letter abbreviations, e.g. E.U.A.
           (?:[DSds][Rr][Aa]?)\.|            # common abbreviations such as dr., sr., sra., dra.
           (?:\B-)?\d+(?:[:.,]\d+)*(?:-?\w)*|
           # numbers in format 999.999.999,999, possibly followed by hyphen and alphanumerics
           # \B- avoids picks as F-14 as a negative number
           \.{3,}|                           # ellipsis or sequences of dots
           \w+|                              # alphanumerics
           -+|                               # any sequence of dashes
           \S                                # any non-space character
    '''
     RegexpTokenizer.__init__(self, tokenizer_regexp)
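This __init__ belongs to a class that subclasses RegexpTokenizer directly. A minimal skeleton of that pattern (class name hypothetical, pattern shortened):

from nltk.tokenize import RegexpTokenizer

class MyTokenizer(RegexpTokenizer):
    def __init__(self):
        # hand the verbose pattern to the parent class, exactly as Example #27 does
        RegexpTokenizer.__init__(self, r'(?x) \w+ | \S')

print(MyTokenizer().tokenize('Hello, world!'))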
Example #28
class tk(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'\b([a-zA-Z]+)\b')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(s) for s in self.tok.tokenize(doc)]

    # define the word list to be ignored
    stop_words = text.ENGLISH_STOP_WORDS
Example #29
def prep_string(s):
    s = re.sub("\n", " ", s)
    s = re.sub("\>", " ", s)
    #toks = Token(TEXT=s, LOC=CharSpanLocation(0, len(s), 's'))
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    toks = RegexpTokenizer(wordre).tokenize(s)
    word_list = []
    for tok in toks:
        word_list.append(tok)
    return word_list
Example #30
def getTokenCount(description):
    tokens = RegexpTokenizer(r'\w+').tokenize(description)
    tokens = [w.lower() for w in tokens]
    stopwords = yaml.safe_load(open("backend/nltk/stopwords.yaml", "r"))  # safe_load: yaml.load now requires an explicit Loader
    tokens = [w for w in tokens if not w in stopwords]
    tokens = [w for w in tokens if len(w) > 2]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(w) for w in tokens]
    tokenCount = collections.Counter(tokens)
    return tokenCount
Example #31
    def parsing(self, line):
        line = line[:-1] # strip the trailing '\n' from the line
        # split the incoming line on tabs and store the pieces in the tokens list
        tokenizer = RegexpTokenizer("\t", gaps=True)
        tokens = tokenizer.tokenize(line)

        count = 1
        for token in tokens :
            if count == 1 :
                self.instruction = token
            elif count == 2 :
                self.format = int(token)
            elif count == 3 :
                self.opcode = int(token, 16)
            elif count == 4 :
                self.numberOfOperand = int(token)
            else :
                print("[InstTable.py] parsing() error")

            count += 1
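Splitting on a literal tab with gaps=True, as the parsing() method above does, is essentially str.split('\t') with empty fields dropped; a one-line check (illustrative input):

from nltk.tokenize import RegexpTokenizer

line = 'LDA\t2\t14\t3'
assert RegexpTokenizer('\t', gaps=True).tokenize(line) == [f for f in line.split('\t') if f]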
Example #32
class tokenizer(object):
	def __init__(self):
		self.tokenize=RegexpTokenizer(r'\b([A-Za-z]+)\b') #remove the punctuations
		if ver==2:
			self.stemmer = SnowballStemmer("english")         #using stemmed version of words
		elif ver==1:
			self.stemmer = LancasterStemmer()	
		else:
			self.stemmer = PorterStemmer()
	def __call__(self, doc):
		return [self.stemmer.stem(token) for token in self.tokenize.tokenize(doc)]
Example #33
class Extracteur_Mots(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.regexp = RegexpTokenizer("[a-z][a-z']{2,}")

    def fit(self, comments, y=None):
        return self

    def transform(self, comments, y=None):
        mots = []
        for c in comments:
            mots.append(self.regexp.tokenize(c.lower()))
        return mots
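Extracteur_Mots in Example #33 is a scikit-learn style transformer (the French name means "word extractor"): fit is a no-op and transform returns one token list per comment. A typical call (illustrative input):

extracteur = Extracteur_Mots()
# TransformerMixin supplies fit_transform from the fit/transform pair above
mots = extracteur.fit_transform(["c'est une tres bonne idee, vraiment"])
print(mots)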
Example #34
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
Example #35
    def parsing(self, line):

        line = line[:-1]  # strip the trailing '\n' from the line

        # split the incoming line on tabs and store the pieces in the tokens list
        tokenizer = RegexpTokenizer("\t", gaps=True)
        tokens = tokenizer.tokenize(line)

        count = 0

        for token in tokens:
            count += 1

            if count == 1:
                self.label = token
            elif count == 2:
                self.operator = token
            elif count == 3:
                opnd = token
                tokenizer = RegexpTokenizer(",", gaps=True)
                opnds = tokenizer.tokenize(opnd)
                i = 0
                for op in opnds:
                    self.operand.append(op)
                    i += 1
            elif count == 4:
                self.comment = token
            else:
                print("[TokenTable.py] parsing() error")
Example #36
def summarize(docid,score_dict,pos_dict):
    global cnt
    tokenizer = RegexpTokenizer('\w+')    ##creating a tokenizer to match the words
    snippet = ""
    with open(path + "//" + docid,"r") as fi:  ## open the extracted text file in read mode
        file_text = fi.read()
        tokens = tokenizer.tokenize(file_text)  ## tokenize the text file using the tokenizer

        if score_dict[docid] not in (0,-1):                 ## for normal phrase/word queries
            cnt += 1 
            for pos in pos_dict[docid]:                     ## get snippets based on identified positions from the position dictionary
                pos1 = pos - 8                              ## take the preceding and following 8 words around the identified position in the text
                pos2 = pos + 8
                if pos1 < 0:
                    pos1 = 0
                if pos2 > len(tokens):
                    pos2 = len(tokens)
                snippet = ' '.join(tokens[pos1:pos2])       
                print docid,"\t",snippet                    ## display docid and snippet
        elif score_dict[docid] == -1:                       ## to display document ids that do not contain the negated word/phrase
            cnt += 1
            print docid
Example #37
    def tokenizeText(self, text):
        ret = RegexpTokenizer(u"(\d+\u00B0(\.)?)|(nr\.)|(\d+/\d+/eg)|(\d+\:\d+\w*)|(\d+\.\d+\w*)+|[\w\d]+|(\s\w\.)|(\.)|\,|\t|[^ \t\n\r\f\v\w\d]")
        tokens = ret.tokenize(text)
        
        ntokens = []
        sentence = []
        i = -1
        for t in tokens[:-1] :
            i += 1
            if type(t) is StringType:
                t = t.decode('UTF-8')

                
            if (t.istitle() and tokens[i-1] == '.') or (regex.search(r'^\d+',t) and tokens[i+1].istitle()):
                ntokens.append(sentence)
                sentence = [t.lower().strip()]
            else :
                sentence.append(t.lower().strip())
        
        sentence.append(tokens[-1].lower().strip())
        ntokens.append(sentence)
        
        return ntokens
Example #38
    def __init__(self, corpus, tokenize_str, delimiter, n, max_length):
        self.corpus = corpus
        self.tokenizer = RegexpTokenizer(tokenize_str)
        self.delimiter = delimiter
        self.n = n
        self.max_length = max_length

        # use set methods to set these variables
        self.tokenized_corpus = []
        self.startList = []
        self.ngramDict = defaultdict(list)
        self.unigramDict = defaultdict(list)
        self.set_tokenized_corpus()
        self.set_ngrams()
Example #39
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ), "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
#         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
Example #40
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
Example #41
def tokenize(text, clean=True):
    """
    Returns a list of lists of the tokens in text, separated by sentences.
    Each line break in the text starts a new list.
    
    :param clean: If True, performs some cleaning on the text, such as replacing
        numbers with the __NUMBER__ keyword (by calling :func:`clean_text`)
    """
    ret = []
    
    if type(text) != unicode:
        text = unicode(text, 'utf-8')
    
    if clean:
        text = clean_text(text, correct=True)
    else:
        # replace numbers with __NUMBER__ and store them so they can be put back later
        numbers = re.findall(ur'\d+(?: \d+)*(?:[\.,]\d+)*[²³]*', text)
        numbers.extend(re.findall(ur'[²³]+', text))
        text = re.sub(ur'\d+( \d+)*([\.,]\d+)*[²³]*', '__NUMBER__', text)
        text = re.sub(ur'[²³]+', '__NUMBER__', text)
    
    # clitic pronouns
    regexp = r'''(?ux)
        (?<=\w)                           # a letter before
        -(me|
        te|
        o|a|no|na|lo|la|se|
        lhe|lho|lha|lhos|lhas|
        nos|
        vos|
        os|as|nos|nas|los|las|            # unless if followed by more chars
        lhes)(?![-\w])                    # or digits or hyphens
    '''
    text = re.sub(regexp, r'- \1', text)
    
    regexp = ur'''(?ux)
    # the order of the patterns is important!!
    ([^\W\d_]\.)+|                # one letter abbreviations, e.g. E.U.A.
    __NUMBER__:__NUMBER__|        # time and proportions
    [DSds][Rr][Aa]?\.|            # common abbreviations such as dr., sr., sra., dra.
    [^\W\d_]{1,2}\$|              # currency
    \w+([-']\w+)*-?|              # words with hyphens or apostrophes, e.g. não-verbal, McDonald's
                                  # or a verb with clitic pronoun removed (trailing hyphen is kept)
    -+|                           # any sequence of dashes
    \.{3,}|                       # ellipsis or sequences of dots
    __LINK__|                     # links found on wikipedia
    \S                            # any non-space character
    '''
    
    # loads trained model for tokenizing Portuguese sentences (provided by NLTK)
    sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
    
    # the sentence tokenizer doesn't consider line breaks as sentence delimiters, so
    # we split them manually.
    sentences = []
    lines = text.split('\n')
    for line in lines:
        sentences.extend(sent_tokenizer.tokenize(line, realign_boundaries=True))
    
    t = RegexpTokenizer(regexp)
    
    for p in sentences:
        if p.strip() == '':
            continue
        
        # Wikipedia cleaning 
        if clean:
            # discard sentences with troublesome templates or links
            if any((x in p for x in ['__TEMPLATE__', '{{', '}}', '[[', ']]'])):
                continue
        
        new_sent = t.tokenize(p)
        
        if clean:
            # discard sentences that are a couple of words (it happens sometimes
            # when extracting data from lists).
            if len(new_sent) <= 2:
                continue
        elif len(numbers) > 0:
            # put back numbers that were previously replaced
            for i in xrange(len(new_sent)):
                token = new_sent[i]
                while '__NUMBER__' in token:
                    token = token.replace('__NUMBER__', numbers.pop(0), 1)
                new_sent[i] = token
        
        ret.append(new_sent)
        
    return ret
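The clitic-pronoun step in Example #41 can be illustrated on its own; the original is Python 2 (ur'' strings), so the sketch below uses a plain raw string:

import re

clitic_re = r'''(?ux)
    (?<=\w)                           # a letter before
    -(me|te|o|a|no|na|lo|la|se|
      lhe|lho|lha|lhos|lhas|
      nos|vos|os|as|los|las|
      lhes)(?![-\w])                  # not followed by more letters, digits or hyphens
'''
print(re.sub(clitic_re, r'- \1', 'disse-me que sim'))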
Example #42
def removeApostrophe(string):
    return string.replace(u"\u2019", '')

def stringFromHTMLParagraph(paraWithTags):
    paraString = ''
    for taggedString in paraWithTags.strings:
        paraString += removeApostrophe(taggedString.string)
    return paraString

def titleFromArticleSoup(soup):
    titleDiv = soup.find(class_ = 'story-heading')
    if not titleDiv:
        titleDiv = soup.find(class_ = 'entry-title')
    return unicode(removeApostrophe(titleDiv.string))

# Set up the tokenizer and the tagger
tokenizer = RegexpTokenizer(r'\w+')
tagger = UnigramTagger(treebank.tagged_sents())

# Open up a redis connection
redisInterface = RedisInterface()

# Print status
print 'Reader ONLINE'

# Run the wait-execute loop
while True:

    while not redisInterface.hasPending():
        sleep(1)

    page = redisInterface.popPending()
Example #43
class Method(MethodInterface):

    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int},
                   'concept_boost': {'default': 3, 'type': int},
                   'np_boost': {'default': 3, 'type': int},
                   'sent_boost': {'default': 1, 'type': int},
                   'stem_boost': {'default': 1, 'type': int},
                   'runmode': {'default': 'train'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ), "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
#         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()

#         if len(args) > 3:s
#             self.ttys = []
#
#             for tty in args[3:]:
#                 if tty in ttygroups:
#                     self.ttys.extend(ttygroups[tty])
#                 else:
#                     self.ttys.append(tty)

    def expand_concept(self, cdata, synonyms=False):
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
        Okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                Okay = False
        if Okay:
            if synonyms:
                return self.concept_synonyms(cdata['ConceptId'])
            else:
                return cdata['ConceptId']

    def concept_synonyms(self, cui):
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")")
    #         query = 'select * from (select distinct STR from MRCONSO a,'+\
    #                 '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL where cui1 = \'%s\'' % cui +\
    #                 ' and rela is not null) b where a.CUI=b.CUI2 and a.LAT=\'ENG\') dd  ;'
            query = "select STR from MRCONSO where " +\
                "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
#             print query
            self.cur.execute(query)

#         self.cur.execute("select STR from MRCONSO where " +
#                          "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
#                          termtypes + " and SAB != 'CHV'")

            syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                              [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')

            doc = self.doc_mod.get_doc(
                ann['topic_id'].lower(), ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[
                ann['citation_offset'][0]:ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[
                ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
            cit_mrk_offset_sent = [ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                                   ann['citation_marker_offset'][1] - ann['citation_offset'][0]]
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')

            '''
            -------------- IMMEDIATE NP BEFORE MARKER ----------
            '''
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # (start, end, group)
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0))
                           for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0))
                           for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0))
                               for e in m3]
                else:
                    markers = []

            if len(markers) > 10000:

                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while(('NP' in toks[2]) and (i < len(t))):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [
                        ' '.join([s1[0] for s1 in t1]) for t1 in concepts]

    #                 nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
    #                 nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
        #             nps = [a[1:-1] for sent in nps for piece in sent for a in piece]
    #                 for e in nps:
    #                     noun_phrases = [(sub_e[0].replace('"', ''),idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check to see if in valid span (not citation markers)
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                                (cit_mrk_offset_sent[0] - f[1] < distance) and\
                                not invalid:
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(re.findall(r"^[^A-Za-z]+$", tokens[idx - 1])) == 0:
                                    nearest = tokens[
                                        idx - 1] + ' ' + tokens[idx]
                                nearest_idx = idx
                        elif (cit_mrk_offset_sent[0] < f[1]):
                            break
                        if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\
                                tokens[nearest_idx] not in stops100:
                            nearest = tokens[idx - 1] + ' ' + tokens[idx]
                    largest = 0
                    q = ''
                    for n in noun_phrases:
                        if (nearest in n) and (len(nearest.split()) > largest):
                            q = '"%s"' % nearest
                            largest = len(nearest.split())
                    if q == '':
                        q = cleaned
                q = sanitize(q)
# find longest noun phrase containing the nearest
#                 res = None
#                 for np in nps[0]:
#                    if nearest in np and len(np) > longest and len(np) < 5:
#                        longest = len(np)
#                        res = np
#                 if res is not None:
#                     res = ' '.join([el for el in res])
#                 else:
#                     res = nearest
            else:
                try:
                    qtxt = unicodedata.normalize('NFKD',
                                                 cleaned).encode('ascii', 'ignore')
                except:
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                tokens = self.tokenizer.tokenize(' '.join(qterms))
    #             tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer)
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not(self.all_digits(t)))])
                if self.opts.concept_boost > 0:

                    qconcepts = mmrun(cleaned)
                    qcids = []
                    for cdata in qconcepts['concepts']:
                        newterms = self.expand_concept(cdata)
                        if newterms is not None:
                            qcids.append(newterms)
                else:
                    qcids = []
                if self.opts.np_boost > 0:
                    nps = self.nlp_extractor.extract_NP(qtxt, mode='flattened')
                    noun_phs = set()
                    for e in nps:
                        for e1 in e:
                            if len(e1) < 4:
                                all_stop = False
                                if self.opts.remove_stopwords:
                                    tmp = ' '.join(sub_e.replace('"', '')
                                                   for sub_e in e1 if sub_e.replace('"', '') not in self.stopwords)
                                else:
                                    count = 0
                                    for sub_e in e1:
                                        if sub_e.replace('"', '') in self.stopwords:
                                            count += 1
                                    if count == len(e1):
                                        all_stop = True
                                    tmp = ' '.join(sub_e.replace('"', '')
                                                   for sub_e in e1)
                                if '"' + tmp.replace('"', '') + '"' not in noun_phs and not all_stop:
                                    noun_phs.add(
                                        '"' + tmp.replace('"', '') + '"')
                else:
                    noun_phs = []

            if self.opts.analyzer:
                r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                              source_fields=[
                                                  'offset', 'sentence'],
                                              # field='sentence',
                                              doc_type=doc_type,
                                              params={'analyzer': self.opts.analyzer})
            else:
                #                 r = self.es_int.multi_field_search(sentence=q,
                #                                                    concepts=' '.join(
                #                                                        [w for w in qcids]),
                #                                                    noun_phrases=' '.join(
                #                                                        [e for e in noun_phs]),
                #                                                    maxsize=self.opts.maxsize,
                #                                                    source_fields=[
                #                                                        'offset', 'sentence', 'mm-concepts', 'noun_phrases'],
                #                                                    doc_type=doc_type,
                #                                                    field_boost=[self.opts.sent_boost,
                #                                                                 self.opts.concept_boost,
                # self.opts.np_boost])
                fields = [
                    'sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed']
                tokens1 = []
                for w in self.tokenizer.tokenize(cleaned):
                    Okay = True
                    if self.opts.remove_stopwords:
                        if w in self.stopwords:
                            Okay = False
                    if '-' in w:
                        tokens1.append(self.stemmer.stem(w.replace('-', '')))
                    if Okay:
                        tokens1.append(self.stemmer.stem(w))
                field_vals = [q, ' '.join([w for w in qcids]),
                              (' '.join([e for e in noun_phs])).replace(
                                  '"', ''),
                              ' '.join([w for w in tokens1])]
                field_boosts = [
                    self.opts.sent_boost, self.opts.concept_boost, self.opts.np_boost, self.opts.stem_boost]
                r = self.es_int.multi_field_search(field_vals=field_vals,
                                                   fields=fields,
                                                   source_fields=[
                                                       'offset', 'sentence'],
                                                   maxsize=self.opts.maxsize,
                                                   field_boost=field_boosts,
                                                   doc_type=doc_type)
#             r = self.es_int.find_all(doc_type=doc_type, source_fields=['offset','sentence'])
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
#                 beg = e['offset'][0][0] - \
#                     100 if e['offset'][0][0] else e['offset'][0][0]
#                 end = e['offset'][0][1] + 100
#                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'score': 0,
                          'sentence': [''],
                          'offset': [(0, 1)],
                          'query':q, '_id':-11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
                
                
            out_results.append(r)
        return out_results
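
# Hedged sketch (not part of the ESInterface wrapper used above): the boosted
# multi-field retrieval in the else-branch could also be expressed as a raw
# Elasticsearch bool/should query, one match clause per field with its boost.
# The helper name build_boosted_query is hypothetical.
def build_boosted_query(field_vals, fields, field_boosts, size):
    return {
        "size": size,
        "_source": ["offset", "sentence"],
        "query": {
            "bool": {
                "should": [
                    {"match": {f: {"query": v, "boost": b}}}
                    for f, v, b in zip(fields, field_vals, field_boosts)
                ]
            }
        },
    }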
Example #44
0
class Tokenizer(object):  
    def __init__(self):
        self.tok=RegexpTokenizer(r'\b([a-zA-Z]+)\b')
        self.stemmer = LancasterStemmer()
    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]  
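
# Hedged usage sketch: a callable like Tokenizer above is typically passed to
# scikit-learn's CountVectorizer so that documents are Lancaster-stemmed while
# being vectorized (assumes scikit-learn is installed and that RegexpTokenizer
# and LancasterStemmer are imported as in the original module).
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=Tokenizer())
X = vectorizer.fit_transform(["The runners were running quickly", "A quick run"])
print(vectorizer.get_feature_names_out())  # stemmed vocabulary (scikit-learn >= 1.0)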
Example #45
0
class NLPCorpus(object):

    def __init__(self, corpus, tokenize_str, delimiter, n, max_length):
        self.corpus = corpus
        self.tokenizer = RegexpTokenizer(tokenize_str)
        self.delimiter = delimiter
        self.n = n
        self.max_length = max_length

        # use set methods to set these variables
        self.tokenized_corpus = []
        self.startList = []
        self.ngramDict = defaultdict(list)
        self.unigramDict = defaultdict(list)
        self.set_tokenized_corpus()
        self.set_ngrams()

    def set_tokenized_corpus(self):
        self.tokenized_corpus = [self.tokenizer.tokenize(sentence) for sentence in self.corpus.split(self.delimiter)]
        # the last member is always empty, so remove it
        self.tokenized_corpus.pop()

    def set_ngrams(self):
        for sentence in self.tokenized_corpus:
            length = len(sentence)
            #append empty string to indicate the end of a sentence
            sentence.append('')
            if(length >= self.n):
                self.startList.append(tuple(sentence[0:self.n]))
                for i in range(length):
                    self.unigramDict[sentence[i]].append(sentence[i+1])
                    if i <= (length - self.n):
                        self.ngramDict[tuple(sentence[i:i+self.n])].append(sentence[i+self.n])
            else:
                self.startList.append(tuple(sentence))
                [self.unigramDict[sentence[j]].append(sentence[j+1]) for j in range(length)]
                self.ngramDict[tuple(sentence)].append('')

    def generate_sentence(self):
        # the start of a generated sentence is always the start of a sentence from the corpus
        key = choice(self.startList)
        sentence = list(key)
        sentence_length = len(" ".join(sentence))

        # keep track of how many n-grams only have a single choice as the following word
        single_choice = 0

        while True:
            if len(self.ngramDict[key]) == 1:
                single_choice += 1
            # use a unigram to select the next word to add more variety
            if single_choice != 3:
                # select one of the words mapped to the current ngram key
                word = choice(self.ngramDict[key])
            else:
                word = choice(self.unigramDict[key[1]])
                single_choice = 0
            sentence_length += len(word) + 1
            if sentence_length <= self.max_length and word:
                sentence.append(word)
                key = key[1:] + (word,)
            else:
                break
        return " ".join(sentence)
Example #46
0
 def __init__(self):
     """Setup a new connection"""
     print(yellow("Initializing new YakDB connection"))
     self.db = YakDBDocumentDatabase()
     # Initialize NLTK objects
     self.nerTokenizer = RegexpTokenizer(r'\s+', gaps=True)
Example #47
0
    # Read the file 'movies_comments.csv'
    df = pd.read_csv('/home/paul/Downloads/movies_comments.csv')

    # Display the first 10 rows of df
    df.head(10)

    # Build the dictionary (vocabulary):
    sunText = ""
    for words in df.Text:
        sunText += " " + words

    from sklearn.feature_extraction.text import CountVectorizer
    from nltk.tokenize.regexp import RegexpTokenizer

    tokenizer = RegexpTokenizer("[a-zA-Zé]{4,}")
    vectorizer = CountVectorizer()
    vectorizer.fit_transform(tokenizer.tokenize(sunText.lower()))


    # Vectorize the words:
    def vect1(words):
        liste = []
        tokens = tokenizer.tokenize(words.lower())
        for word in tokens:
            liste.append(vectorizer.transform([word]).toarray())

        return np.asarray(liste)
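
    # Hedged quick check of vect1 above: each token of at least four letters is
    # mapped to one CountVectorizer row, so the result has shape
    # (n_tokens, 1, vocabulary_size); unknown words simply give all-zero rows.
    sample = vect1("Un film magnifique")
    print(sample.shape)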


    def convY(word_conv):
Example #48
0
def tokenize(text):
    str_ = "[A-Za-z]+"
    regex_tokens = RegexpTokenizer(str_)
    tokens = regex_tokens.tokenize(text.lower())   
    stems = stem_tokens(tokens, WNL)
    return stems
Example #49
0
 def __init__(self):
    RegexpTokenizer.__init__(self, r'[\w+#]+|[^\w\s]+')
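
# Hedged illustration of the pattern above (assumes nltk's RegexpTokenizer is
# imported as elsewhere in these examples): '+' and '#' are allowed inside word
# tokens, so identifiers such as "C++" and "C#" survive as single tokens.
demo_tok = RegexpTokenizer(r'[\w+#]+|[^\w\s]+')
print(demo_tok.tokenize("I love C# and C++!"))  # ['I', 'love', 'C#', 'and', 'C++', '!']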
 def get_word_symbols_tokens(cls, text):
     tokenizer = RegexpTokenizer('\s+', gaps=True)
     return tokenizer.tokenize(text)
 def get_word_tokens(cls, text):
     tokenizer = RegexpTokenizer('\w+')
     return tokenizer.tokenize(text)
Example #52
0
 def no_stop_tokens(self,text):
     tokens = []
     tokenizer = RegexpTokenizer('(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
     tokens += tokenizer.tokenize(text)
     return tokens
Example #53
0
class Method(MethodInterface):

    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 3},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with open(stop_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
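        # gaps=True: split on runs of characters that are neither word characters,
        # hyphens nor apostrophes, so hyphenated terms and contractions stay whole.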
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)

    def run(self, test_data):
        #         with codecs.open('tmp/test_data.json', 'wb', 'utf-8') as mf:
        #             json.dump(test_data, mf, indent=2)
        out_results = []
        det_res = {}
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('train', 'eval')
            doc_type = doc_type.replace(',', '').replace("'", '"')

            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('eval', 'train')

            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))

            # preprocess (remove citations) and tokenize the
            # citation text before submitting it to Elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
#             tokens = self.es_int.tokenize(q, "sentence")
            tokens = self.tokenizer.tokenize(q)
            tokens = ['"' + t + '"' if '-' in t else t for t in tokens]
            q = ' '.join([t for t in tokens
                          if (t not in self.stopwords and
                              t not in authors and
                              not(self.all_digits(t)))])

            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                new_query = ''
                for i in range(len(tokens) - self.opts.ngram):
                    tmp = ''
                    for j in range(i, i + self.opts.ngram):
                        tmp += tokens[j] + ' '
                    new_query += '"' + tmp.strip() + '" '
                q = new_query.strip()
#             q = '*:*'
            if self.opts.analyzer:
                r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                              source_fields=[
                                                  'offset', 'sentence'],
                                              # field='sentence',
                                              doc_type=doc_type,
                                              params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                              source_fields=[
                                                  'offset', 'sentence'],
                                              # field='sentence',
                                              doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
#                 beg = e['offset'][0][0] - \
#                     100 if e['offset'][0][0] else e['offset'][0][0]
#                 end = e['offset'][0][1] + 100
#                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
                e['topic'] = ann['topic_id'].lower()

            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query':q, '_id':-11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
#         with codecs.open('tmp/out_results.json', 'wb', 'utf-8') as mf:
#             json.dump(out_results, mf, indent=2)
#         sys.exit()
        return out_results
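
# Hedged illustration of the citation-stripping regex defined in Method.__init__
# above: it removes multi-author parenthesised citations such as
# "(Smith et al., 2010)" and bracketed numeric ones such as "[3,4]" before the
# citance is turned into a query. The pattern is copied verbatim from the class.
import re

regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                            r"(\[(\d+([,–]\s?)?)+\])|"
                            r"\[[\d,-]+\]").sub
print(regex_citation('', "This was shown earlier [3,4] and confirmed (Smith et al., 2010)."))
# -> 'This was shown earlier  and confirmed .'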
Example #54
0
 def words(self,text):
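     # Note: inside a character class '|' is a literal character, not alternation,
     # so the pattern below also treats '|' as part of a token; apart from that it
     # covers '#', '$', word characters and the listed Arabic diacritics.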
     reg_words = r'[\#|\$|\w|َ|ِ|ً|ٌ|ٍ|ْ|ّ|ُ]+'
     tokenizer = RegexpTokenizer(reg_words, flags=re.UNICODE|re.IGNORECASE)
     return tokenizer.tokenize(text)
Example #55
0
 def __init__(self):
     RegexpTokenizer.__init__(self, r'\w+[-\w+]*|[^\w\s]+')
Example #56
0
 def __init__(self):
     self.tok=RegexpTokenizer(r'\b([a-zA-Z]+)\b')
     self.stemmer = LancasterStemmer()
class TranslatronProtocol(WebSocketServerProtocol):
    def __init__(self):
        """Setup a new connection"""
        print(yellow("Initializing new YakDB connection"))
        self.db = YakDBDocumentDatabase()
        # Initialize NLTK objects
        self.nerTokenizer = RegexpTokenizer(r'\s+', gaps=True)

    def onConnect(self, request):
        pass

    def onOpen(self):
        pass

    def performDocumentSearch(self, query):
        """
        Perform a token search on the document database.
        Search is performed in multi-token prefix (all must hit) mode.
        Tokens with no hits at all are ignored entirely
        """
        startTime = time.time()
        queryTokens = map(str.lower, word_tokenize(query))
        levels = [b"title", b"content", b"metadata"]
        #Remove single-character tokens from the query -- they are way too general!
        #Also remove exclusively-non-alnum tokens
        queryTokens = [tk for tk in queryTokens if (len(tk) > 1 and has_alpha_chars(tk))]
        results = self.db.searchDocumentsMultiTokenPrefix(queryTokens, levels=levels)
        #Return only those paragraphs around the hit paragraph (or the first 3 paragraphs)
        for hitLocation, doc in results.items():
            (docId, docLoc) = InvertedIndex.splitEntityIdPart(hitLocation)
            #Compute which paragraphs to display
            minShowPar = 0
            maxShowPar = 2
            if docLoc.startswith(b"paragraph"):
                paragraphNo = int(docLoc[9:])
                minShowPar = max(0, paragraphNo - 1)
                maxShowPar = min(len(doc[b"paragraphs"]), paragraphNo + 1)
            #Modify documents
            results[hitLocation][b"hitLocation"] = docLoc
            results[hitLocation][b"paragraphs"] = doc[b"paragraphs"][minShowPar:maxShowPar]
        # Measure timing
        timeDiff = (time.time() - startTime) * 1000.0
        print("Document search for %d tokens took %.1f milliseconds" % (len(queryTokens), timeDiff))
        return results

    def uniquifyEntities(self, entities):
        """Remove duplicates from a list of entities (key: ["id"])"""
        seen = set()
        result = []
        for entity in entities:
            itemId = entity[b"id"]
            if itemId in seen: continue
            seen.add(itemId)
            result.append(entity)
        return result

    def performEntitySearch(self, query):
        """
        Search entities. Tokens are not split, in order to allow simple search
        for multi-token entities like "Biological process"
        """
        results = self.db.searchEntitiesSingleTokenMultiExact([query], level=b"aliases")
        #Return only the result array. TODO: can't we just use results[query]?
        if query not in results:
            return []
        return results[query]

    def filterNERTokens(self, token):
        """
        Filter function to remove stuff that just clutters the display.
        """
        #Short numbers are NOT considered database IDs.
        #NOTE: In reality, pretty much all numbers are Allergome database IDs, e.g. see
        # http://www.allergome.org/script/dettaglio.php?id_molecule=14
        if len(token) <= 5 and token.isdigit():
            return False
        return True

    def performEntityNER(self, query):
        "Search a query text for entity/entity alias hits"
        startTime = time.time()
        tokens = self.nerTokenizer.tokenize(query)
        queryTokens = [s.encode("utf-8") for s in tokens]
        # Search for case-sensitive hits
        searchFN = InvertedIndex.searchSingleTokenMultiExact
        results = searchFN(self.db.entityIdx.index, frozenset(filter(self.filterNERTokens, queryTokens)), level=b"aliases")
        # Results contains a list of tuples (dbid, db) for each hit. dbid is db + b":" + actual ID
        # We only need the actual ID, so remove the DBID prefix (which is required to avoid inadvertently merging entries).
        # This implies that the DBID MUST contain a colon!
        results =  {k: [(a.partition(b":")[2], b) for (a, b) in v] for k, v in results.items() if v}
        #
        # Multi-token NER
        # Based on case-insensitive entries where only the first token is indexed.
        #
        # TESTING: Multi token NER
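        # Example: for an alias like "Biological process" only the first token
        # ("biological") is in the case-insensitive index; a hit on it is then
        # verified against the following query tokens before the full span is reported.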
        lowercaseQueryTokens = [t.lower() for t in queryTokens]
        t1 = time.time()
        ciResults = searchFN(self.db.entityIdx.index, frozenset(lowercaseQueryTokens), level=b"cialiases")
        t2 = time.time()
        print("TX " + str(t2 - t1))
        for (firstTokenHit, hits) in ciResults.items():
            #Find all possible locations where the full hit could start, i.e. where the first token produced a hit
            possibleHitStartIndices = [i for i, x in enumerate(lowercaseQueryTokens) if x == firstTokenHit]
            #Iterate over all candidate hits for this first token
            for hit in hits:
                hitLoc, _, hitStr = hit[1].rpartition(b"\x1D") # Full (whitespace separated) entity name
                if not hitStr: continue #Ignore malformed entries. Should usually not happen
                hitTokens = [t.lower() for t in hitStr.split()]
                numTokens = len(hitTokens)
                #Check if at any possible hit start index the same tokens occur (in the same order)
                for startIdx in possibleHitStartIndices:
                    actualTokens = lowercaseQueryTokens[startIdx : startIdx+numTokens]
                    #Check if the lists are equal. Shortcut for single-token hits
                    if numTokens == 1 or all((a == b for a, b in zip(actualTokens, hitTokens))):
                        #Reconstruct original (case-sensitive) version of the hit
                        csTokens = queryTokens[startIdx : startIdx+numTokens]
                        #NOTE: This MIGHT cause nothing to be highlighted, if the reconstruction
                        # of the original text is not equal to the actual text. This is true exactly
                        # if the tokenizer removes or changes characters besides whitespace in the text.
                        csHit = b" ".join(csTokens)
                        # Emulate defaultdict behaviour
                        if not csHit in results: results[csHit] = []
                        results[csHit].append((hitStr, hitLoc))
        t3 = time.time()
        print("TY " + str(t3 - t2))
        # TODO: Remove results which are subsets of other hits. This occurs only if we have multi-token results
        removeKeys = set() # Can't modify dict while iterating it, so aggregate keys to delete
        for key in results.keys():
            # Ignore single part results
            if any((chr(c).isspace() for c in key)):
                tokens = key.split()
                for token in tokens:
                    # Remove sub-hit in results.
                    # This avoids the possibility of highlighting the smaller hit
                    if token in results:
                        removeKeys.add(token)
        # Remove aggregated keys
        for key in removeKeys:
            del results[key]
        # Result: For each token with hits --> (DBID, Database name)
        # Just take the first DBID. It is unlikely that different DBIDs are found, but we
        #   can only link to one using the highlighted label
        ret =  {k: (v[0][0], v[0][1]) for k, v in results.items() if v}
        # Measure timing
        timeDiff = (time.time() - startTime) * 1000.0
        print("NER for %d tokens took %.1f milliseconds" % (len(queryTokens), timeDiff))
        return ret


    def onMessage(self, payload, isBinary):
        request = json.loads(payload.decode('utf8'))
        # Perform action depending on query type
        qtype = request["qtype"]
        if qtype == "docsearch":
            results = self.performDocumentSearch(request["term"])
            del request["term"]
            request["results"] = list(results.values())
        elif qtype == "ner":
            results = self.performEntityNER(request["query"])
            del request["query"]
            request["results"] = results
        elif qtype == "metadb":
            # Send meta-database to generate
            request["results"] = metaDB
        elif qtype == "entitysearch":
            request["entities"] = self.performEntitySearch(request["term"])
            del request["term"]
        elif qtype == "getdocuments":
            # Serve one or multiple documents by IDs
            docIds = [s.encode() for s in request["query"]]
            request["results"] = self.db.docIdx.findEntities(docIds)
            del request["query"]
        else:
            print(red("Unknown websocket request type: %s" % request["qtype"], bold=True))
            return # Do not send reply
        #Return modified request object: keeps custom K/V pairs but does not re-send the query
        self.sendMessage(json.dumps(request, default=documentSerializer).encode("utf-8"), False)
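
    # Hedged sketch of the wire protocol handled by onMessage above: the client sends
    # a JSON object with a "qtype" field plus its payload and receives the same object
    # back with the results attached, e.g.
    #   {"qtype": "docsearch", "term": "allergen"}               -> adds "results" (list)
    #   {"qtype": "ner", "query": "text to annotate"}            -> adds "results" (dict)
    #   {"qtype": "entitysearch", "term": "Biological process"}  -> adds "entities"
    #   {"qtype": "getdocuments", "query": ["<docId>", ...]}     -> adds "results"
    #   {"qtype": "metadb"}                                      -> adds "results" (meta-database)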

    def onClose(self, wasClean, code, reason):
        print("WebSocket connection closed: {0}".format(reason))