def parsing(self, line):

        line = line[:-1]  # strip the trailing '\n'

        # split the incoming line on tabs and store the pieces in the tokens list
        tokenizer = RegexpTokenizer("\t", gaps=True)
        tokens = tokenizer.tokenize(line)

        count = 0

        for token in tokens:
            count += 1

            if count == 1:
                self.label = token
            elif count == 2:
                self.operator = token
            elif count == 3:
                opnd = token
                tokenizer = RegexpTokenizer(",", gaps=True)
                opnds = tokenizer.tokenize(opnd)
                i = 0
                for op in opnds:
                    self.operand.append(op)
                    i += 1
            elif count == 4:
                self.comment = token
            else:
                print("[TokenTable.py] parsing() error")
def create_sents(toks):
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    tokenizer = RegexpTokenizer(wordre)
    # split the raw text into sentence-level chunks, then tokenize each chunk
    sentences = tokenizer.tokenize(toks)
    return [tokenizer.tokenize(sentence) for sentence in sentences]
Example #3
    def tokenize(self, text):
        """
           tokenize text into a list of Token objects

            :param text: text to be tokenized (might contains several sentences)
            :type text: str
            :return: List of Token objects
            :rtype: list(Token)
        """
        tokens = []

        if self.tokenizer_type == "SpaceTokenizer":
            operator = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
            for counter, span in enumerate(operator.span_tokenize(text)):
                new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
                tokens.append(new_token)

        elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
            operator = WhitespaceTokenizer()
            for counter, span in enumerate(operator.span_tokenize(text)):
                new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
                tokens.append(new_token)

        elif self.tokenizer_type == "PTBTokenizer":
            ptb_tokens = word_tokenize(text)
            counter = 0
            for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
                new_token = Token(counter, token, span[0], span[1])
                counter += 1
                tokens.append(new_token)

        return tokens
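A minimal usage sketch of the span-based tokenization in the "SpaceTokenizer" branch above; the project's Token class is not shown here, so plain tuples are printed instead (assumes NLTK is installed):

from nltk.tokenize import RegexpTokenizer

text = "muffins cost $3.88 here"
operator = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
for counter, (start, end) in enumerate(operator.span_tokenize(text)):
    # each span is a (start, end) character offset into the original text
    print(counter, text[start:end], start, end)
# expected output:
# 0 muffins 0 7
# 1 cost 8 12
# 2 $3.88 13 18
# 3 here 19 23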
Example #4
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer(r'[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
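A small sketch of what these citation patterns are for: reg_ieee (copied from the __init__ above) matches bracketed numeric citations, so substituting them away strips them from a sentence (the sample sentence is made up):

import re

reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
sentence = "This extends earlier work [1,2] and the method of [3]."
print(reg_ieee.sub("", sentence))
# expected: This extends earlier work  and the method of .
# (note the doubled space where the citation used to be)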
Example #5
    def parse(self, fname):
        """
        Парсинг текста файла
        :param fname: имя файла
        :return: (<имя_файла>, тошнота, мошенничество)
        """
        density, fraud = 0, 0
        with codecs.open(fname, "r", encoding="utf-8") as f:
            text = f.read()
        tknz = RegexpTokenizer(pattern="[А-Яа-яA-Za-zё]+")  # Cyrillic and Latin letters
        txt_list = tknz.tokenize(text)
        if txt_list:
            for i, word in enumerate(txt_list):
                new_word = self.check_word(word)
                if new_word:
                    txt_list[i] = new_word
                    fraud += 1

            txt_list = [
                word.lower() for word in txt_list
                if not (word.lower() in self.sw)
            ]
            stemmer_ru = RussianStemmer()
            txt_list = [
                stemmer_ru.stem(token.lower()) for token in txt_list
                if len(token) > 1
            ]
            dict_w = Counter(txt_list)
            top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
            top5_count = sum([dict_w[word] for word in top5])
            density = top5_count / len(txt_list)
        # the (fraud > 2) threshold was chosen based on testing against the available sample;
        # listings often contain strings like "ШxДхВ" (WxDxH) that cannot be recognised unambiguously,
        # so this criterion is open to discussion and may need adjusting
        return fname, density, fraud > 2
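A toy illustration of the "nausea" (keyword density) value computed above: the share of all tokens taken up by the five most frequent ones (the token list is made up):

import heapq
from collections import Counter

txt_list = ["flat", "flat", "repair", "repair", "repair",
            "metro", "park", "garden", "sunny", "cosy"]
dict_w = Counter(txt_list)
top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
density = sum(dict_w[word] for word in top5) / len(txt_list)
print(round(density, 2))  # 0.7 -- the top five tokens cover 7 of the 10 tokens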
Example #6
def get_emails_sent_by_person_list(emails_df):
    tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')
    emails_df['subject_wc'] = emails_df['subject'].map(
        lambda x: len(tokenizer.tokenize(x)))
    emails_df['content_wc'] = emails_df['content'].map(
        lambda x: len(tokenizer.tokenize(x)))

    grouped_by_people = emails_df.groupby('from').agg({
        'content': 'count',
        'subject_wc': 'mean',
        'content_wc': 'mean',
    })

    grouped_by_people.rename(columns={
        'content': 'N emails',
        'subject_wc': 'Subject word count',
        'content_wc': 'Content word count'
    },
                             inplace=True)

    grouped_by_people.sort_values(by=['N emails'], ascending=False, inplace=True)

    file_path_send = os.path.join(dir_path, 'results/emails_by_person.csv')

    grouped_by_people.to_csv(file_path_send)
Example #7
 def no_stop_tokens(self,text):
     tokens = []
     tokenizer = RegexpTokenizer(r'(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
     tokens += tokenizer.tokenize(text)
     #stemmer = nltk.stem.snowball.EnglishStemmer()
     #tokens = map(lambda x: stemmer.stem(x),tokens)
     return tokens
Example #8
def token_words(lyric):
    """
    in: lyric(element of row['text'])
    take whole lyric and convert it into list of words for analysis
    apply few cleaning processes tot remove punctuation & stopwords & errors(Minor focus on this)
    return: list of words in the lyric
    """
    lyric = lyric.lower()
    """
     tokenizer that will tokenize lyric('text') into words without punctuation
     it will split aphostrophe words into 2 seperate words but its okay
     as most of the time words with aphostrophe are non-main verbs(would,should,etc)
     non-main verbs are usually insignificant in most of the context and will be deleted
     e.g : would've = would ve but this is fine as we know stopwords will remove ve
     tweetTokenizer was producing very irregular words in lyric such as (8, numbers and was dist
    """
    #apply tokenizer
    tokenizer1 = RegexpTokenizer("[a-z]+")
    words = tokenizer1.tokenize(lyric)
    #convert list of stopwords to set of stopwords for faster access
    en_stopwords = set(stopwords.words('english'))
    #we remove stopwords from the word list
    #and add a few extra words (like 'chorus') to the stopword set for a cleaner result
    en_stopwords.add('chorus')
    #single letters aren't really words :)
    for c in ascii_lowercase:
        en_stopwords.add(c)

    words_lyric = [w for w in words if not w in en_stopwords]

    #postProcess of words_lyric
    words_lyric = postProcess(words_lyric)

    return words_lyric
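A minimal sketch of the core steps above, without the project-specific postProcess (assumes the NLTK stopwords corpus has been downloaded; the lyric is made up):

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

lyric = "I've been singing the chorus to you all night"
words = RegexpTokenizer("[a-z]+").tokenize(lyric.lower())
en_stopwords = set(stopwords.words('english')) | {'chorus'}
print([w for w in words if w not in en_stopwords])
# expected: ['singing', 'night']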
Example #9
def tokenize_sentence(text, preprocess=True):
    '''
    Tokenize the given sentence and applies preprocessing if requested 
    (conversion to lower case and digit substitution).
    '''
    if preprocess:
        text = re.sub(r'\d', '9', text.lower())

    tokenizer_regexp = ur'''(?ux)
    ([^\W\d_]\.)+|                # one letter abbreviations, e.g. E.U.A.
    \d{1,3}(\.\d{3})*(,\d+)|      # numbers in format 999.999.999,99999
    \d{1,3}(,\d{3})*(\.\d+)|      # numbers in format 999,999,999.99999
    \d+:\d+|                      # time and proportions
    \d+([-\\/]\d+)*|              # dates. 12/03/2012 12-03-2012
    [DSds][Rr][Aa]?\.|            # common abbreviations such as dr., sr., sra., dra.
    [Mm]\.?[Ss][Cc]\.?|           # M.Sc. with or without capitalization and dots
    [Pp][Hh]\.?[Dd]\.?|           # Same for Ph.D.
    [^\W\d_]{1,2}\$|              # currency
    (?:(?<=\s)|^)[\#@]\w*[A-Za-z_]+\w*|  # Hashtags and twitter user names
    -[^\W\d_]+|                   # clitic pronouns with leading hyphen
    \w+([-']\w+)*|                # words with hyphens or apostrophes, e.g. não-verbal, McDonald's
    -+|                           # any sequence of dashes
    \.{3,}|                       # ellipsis or sequences of dots
    \S                            # any non-space character
    '''
    tokenizer = RegexpTokenizer(tokenizer_regexp)

    return tokenizer.tokenize(text)
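The pattern above is a Python 2 ur'' literal with capturing groups; a much smaller Python 3 sketch of the same verbose-regex idea (a simplified, hypothetical pattern using non-capturing groups) looks like this:

from nltk.tokenize import RegexpTokenizer

pattern = r'''(?x)
      \d+(?:[./:-]\d+)*     # numbers, dates, times, proportions
    | \w+(?:['-]\w+)*       # words with internal apostrophes or hyphens
    | \S                    # any other non-space character
    '''
tokenizer = RegexpTokenizer(pattern)
print(tokenizer.tokenize("McDonald's opened on 12/03/2012 at 9:30."))
# expected: ["McDonald's", 'opened', 'on', '12/03/2012', 'at', '9:30', '.']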
Example #10
	def __init__(self):
		self.tokenize=RegexpTokenizer(r'\b([A-Za-z]+)\b') #remove the punctuations
		if ver==2:
			self.stemmer = SnowballStemmer("english")         #using stemmed version of words
		elif ver==1:
			self.stemmer = LancasterStemmer()	
		else:
			self.stemmer = PorterStemmer()
Example #11
def emailExtractor(sentence, word):
    # https://stackoverflow.com/questions/39777806/how-to-update-nltk-package-so-that-it-does-not-break-email-into-3-different-toke
    pattern = r'\S+@[^\s.]+\.[a-zA-Z]+|\w+|[^\w\s]'
    tokeniser = RegexpTokenizer(pattern)
    for w in tokeniser.tokenize(sentence):
        if re.search(r'^(\w|\.|\_|\-)+[@](\w|\_|\-|\.)+[.]\w{2,3}$', w):
            context["email"] = w
            return True
    return False
Example #12
 def tokenize(self,text):
     tokens = []
     tokenizer = RegexpTokenizer(r'(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
     #tokens += tokenizer.tokenize(self.title.lower())
     tokens += tokenizer.tokenize(text.lower())
     tokens = filter(lambda x: x not in STOP_WORDS and len(x) >1 ,tokens)
     #stemmer = nltk.stem.snowball.EnglishStemmer()
     #tokens = map(lambda x: stemmer.stem(x),tokens)
     return tokens
Example #13
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {
            "syns": ('AUN', 'EQ', 'SYN', 'MTH'),
            "chemicals": ('CCN', 'CSN'),
            "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
            "diseases": ('DI', ),
            "findings": ('FI', ),
            "hierarchy": ('HS', 'HT', 'HX'),
            "related": ('RT', ),
            "preferred": ('PTN', 'PT')
        }
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        #         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+"
        )
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)"
        )
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer(r'[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
def tokenize_large_text_file(file_name, file_location):
    tokens_array = set()
    file_path = os.path.join(file_location, file_name)
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)  # split on runs of whitespace
    with open(file_path, 'r') as f:
        for line in f:
            tokens_array.update(tokenizer.tokenize(line))
    tokens_dict = {str(file_name) + " - {} tokens".format(file_name): list(tokens_array)}
    with open('tokens taken from - {} - .json'.format(str(file_name)), 'w') as f:
        json.dump(tokens_dict, f)
    def __init__(self):
        nltk.download("punkt")
        nltk.download("stopwords")
        nltk.download("wordnet")  # lemmatization

        self._tokenizer = RegexpTokenizer(r"\w+")
        self._stop_words = set(stopwords.words("english"))
        # self._stemmer = nltk.stem.SnowballStemmer("english")
        self._lemmatizer = nltk.wordnet.WordNetLemmatizer()
        self._vocabulary = set()
def getTokenCount(description):
    tokens = RegexpTokenizer(r'\w+').tokenize(description)
    tokens = [w.lower() for w in tokens]
    stopwords = yaml.safe_load(open("backend/nltk/stopwords.yaml", "r"))
    tokens = [w for w in tokens if not w in stopwords]
    tokens = [w for w in tokens if len(w) > 2]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(w) for w in tokens]
    tokenCount = collections.Counter(tokens)
    return tokenCount
def prep_string(s):
    s = re.sub("\n", " ", s)
    s = re.sub("\>", " ", s)
    #toks = Token(TEXT=s, LOC=CharSpanLocation(0, len(s), 's'))
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    toks = RegexpTokenizer(wordre).tokenize(s)
    return list(toks)
 def getDoc_set():
     tokenizer = RegexpTokenizer(r'\w+')
     for doc in getCorpus.corpus_doc:
         #             print type(doc)
         raw = doc.lower()
         tokens = tokenizer.tokenize(raw)
         en_stop = get_stop_words("en")
         stopped_tokens = [i for i in tokens if i not in en_stop]
         p_stemmer = PorterStemmer()
         texts = [p_stemmer.stem(i).encode('utf-8') for i in stopped_tokens]
         getCorpus.doc_set.append(texts)
Example #19
    def calcLocation(self, token):
        len = self.instTab.searchFormat(token.operator)
        if len > 0:
            return len
        else:
            if token.operator == "RESW" or token.operator == "WORD":
                len = 3

            elif token.operator == "RESB":
                len = int(token.operand[0])

            elif token.operator == "BYTE":
                len = 1

            elif token.operator == "LTORG":
                len = self.literalTab.literalCount
                self.literalTab.setLiteralCount(0)
                count = 0
                for litCheck in self.literCheck:
                    if litCheck[1:2] == 'C':
                        len = 3
                    else:
                        len = 1
                    self.literalTab.modifyLiteral(
                        litCheck, TokenTable.locCount + (count * len))
                    count += 1

            elif token.operator == "END":
                len = self.literalTab.literalCount
                self.literalTab.setLiteralCount(0)
                count = 0
                for litCheck in self.literCheck:
                    self.literalTab.modifyLiteral(litCheck, token.location)
                    count += 1

            elif token.operator == "EQU":
                if token.operand[0] == "*":
                    len = 0

                else:

                    tokenizer = RegexpTokenizer("-", gaps=True)
                    tokens = tokenizer.tokenize(token.operand[0])

                    value1 = self.symTab.search(tokens[0])
                    value2 = self.symTab.search(tokens[1])
                    len = value1 - value2
                    self.symTab.modifySymbol(token.label, len)
                    len = 0

            else:
                len = -1

        return len
Example #20
def get_search_terms(search_text: str):
    # Get any search terms

    tr = RegexpTokenizer(r'[^"\s]\S*|".+?"', gaps=False)

    # Respect quoted strings
    search_terms = tr.tokenize(search_text)
    if len(search_terms) == 0:
        solr_search_terms = "*"
    else:
        solr_search_terms = ' '.join(search_terms)
    return solr_search_terms
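A quick sketch of the quoted-string behaviour (the query string is made up; the commented output is what the pattern is expected to produce):

from nltk.tokenize import RegexpTokenizer

tr = RegexpTokenizer(r'[^"\s]\S*|".+?"', gaps=False)
print(tr.tokenize('foo "bar baz" qux'))
# expected: ['foo', '"bar baz"', 'qux']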
Example #21
def get_search_terms(request: HttpRequest):
    # Get any search terms

    tr = RegexpTokenizer(r'[^"\s]\S*|".+?"', gaps=False)
    search_text = str(request.GET.get('search_text', ''))
    # Respect quoted strings
    search_terms = tr.tokenize(search_text)
    if len(search_terms) == 0:
        solr_search_terms = "*"
    else:
        solr_search_terms = ' '.join(search_terms)
    return solr_search_terms
Example #22
    def __init__(self, corpus, tokenize_str, delimiter, n, max_length):
        self.corpus = corpus
        self.tokenizer = RegexpTokenizer(tokenize_str)
        self.delimiter = delimiter
        self.n = n
        self.max_length = max_length

        # use set methods to set these variables
        self.tokenized_corpus = []
        self.startList = []
        self.ngramDict = defaultdict(list)
        self.unigramDict = defaultdict(list)
        self.set_tokenized_corpus()
        self.set_ngrams()
def sentence_length(corpus):
    too_long_sentences = []
    total_sentences = 0

    tokenizer = RegexpTokenizer(r"\s+", gaps=True)

    articles = preprocessing(corpus)
    for article in articles:
        sentences = sent_tokenize(article)
        total_sentences += len(sentences)
        for sentence in sentences:
            words = tokenizer.tokenize(sentence)
            if (len(words) > 25):
                too_long_sentences.append((sentence, len(words)))

    return (1 - len(too_long_sentences) / total_sentences) * 100
 def getNew_object(docs, newDoc_object):
     tokenizer = RegexpTokenizer(r'\w+')
     for doc in docs:
         #             print type(doc[1])
         raw = doc[1].lower()
         tokens = tokenizer.tokenize(raw)
         en_stop = get_stop_words("en")
         stopped_tokens = [i for i in tokens if i not in en_stop]
         p_stemmer = PorterStemmer()
         texts = {}
         #             texts = [p_stemmer.stem(i).encode('utf-8') for i in stopped_tokens]
         #             print texts
         for i in stopped_tokens:
             texts[p_stemmer.stem(i).encode('utf-8')] = texts.get(
                 p_stemmer.stem(i).encode('utf-8'), 0) + 1
         newDoc_object.append(texts)
    def _getWordLists(self):

        # tokenize sentences
        #wordLists = map(lambda s: WordPunctTokenizer().tokenize(s), self.sentenceList)
        #wordLists = map(lambda s: PunktWordTokenizer().tokenize(s), self.sentenceList)
        wordLists = map(lambda s: RegexpTokenizer("\w+").tokenize(s),
                        self.sentenceList)

        # remove stopwords
        stopWords = stopwords.words('english')
        wordLists = map(
            lambda wlist: filter(lambda w: w not in stopWords, wlist),
            wordLists)

        # use stemmer
        #stemmer = PorterStemmer()
        #wordLists = map(lambda wlist: map(lambda w: stemmer.stem(w), wlist), wordLists)

        return wordLists
Example #26
    def __init__(self,
                 root,
                 fields=DOC_PATTERN,
                 sent_pattern=SENT_PATTERN,
                 encoding='utf8',
                 **kargs):
        """
        :param root: corpusが入っているdir
        :param fields: 対象となるcorpus
        :param encoding:
        """

        PlaintextCorpusReader.__init__(
            self,
            root,
            fields,
            word_tokenizer=JanomeTokenizer(),
            sent_tokenizer=RegexpTokenizer(sent_pattern),
            encoding=encoding)
 def get_special_text_tokeniser(self):
     """
     @deprecated
     Customised NLTK Regex Tokeniser for special cases: e.g., 3rd, 2nd,1-23-4562, 425-12-3456, wal-mart
     TODO: try to use solr StandardTokenizer 
     """
     '''
     special_text_token_pattern=r""" (?x) # set flag to allow verbose regexps
                 ([A-Z]\.)+     # abbreviations, e.g. U.S.A.
                 |(\$)?\d+(\.\d+)?%?[a-zA-Z0-9]* # currency and percentages, $12.40, 50%, and mix of number and characters, 3rd, 2nd
                 |\w+(-\w+)*     # words with internal hyphens
                 #|[a-zA-Z0-9]+  # 
                 |'s # POS
                 |\.\.\.         # ellipsis
                 |[][.,;"'?():*\-_/\\@&']    # separate special character tokens (punctuations)
                 """
     '''
     pattern = r'''(?x) (?:[A-Z]\.)+|\d+(?:\.\d+)?%?|\w+(?:[-']\w+)*|(?:[.,;"'?():*\-_/\\@&'])'''
     return RegexpTokenizer(pattern)
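A usage sketch for the active (non-commented) pattern above (the sample phrase is made up; the commented output is what the pattern is expected to produce):

from nltk.tokenize import RegexpTokenizer

pattern = r'''(?x) (?:[A-Z]\.)+|\d+(?:\.\d+)?%?|\w+(?:[-']\w+)*|(?:[.,;"'?():*\-_/\\@&'])'''
tokeniser = RegexpTokenizer(pattern)
print(tokeniser.tokenize("U.S.A. policy, wal-mart's prices"))
# expected: ['U.S.A.', 'policy', ',', "wal-mart's", 'prices']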
    def parsing(self, line):
        line = line[:-1]  # strip the trailing '\n'
        # split the incoming line on tabs and store the pieces in the tokens list
        tokenizer = RegexpTokenizer("\t", gaps=True)
        tokens = tokenizer.tokenize(line)

        count = 1
        for token in tokens :
            if count == 1 :
                self.instruction = token
            elif count == 2 :
                self.format = int(token)
            elif count == 3 :
                self.opcode = int(token, 16)
            elif count == 4 :
                self.numberOfOperand = int(token)
            else :
                print("[InstTable.py] parsing() error")

            count += 1
Example #29
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with file(stop_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer(r'[^\w\-\']+', gaps=True)
Example #30
def tokenizeArticle(article):
    # wordTokens = word_tokenize(article)
    tokenizer = RegexpTokenizer(r'\w+')
    wordTokens = tokenizer.tokenize(article)
    #strings with no letters
    pattern = re.compile(r"^[\W\s_0-9]+$")
    filteredTokens = [
        token for token in wordTokens if not pattern.match(token)
    ]
    #remove punctuation
    filteredTokens = [
        token for token in filteredTokens if not token in string.punctuation
    ]
    #remove empty strings
    filteredTokens = [
        token for token in filteredTokens
        if not token == "''" and not token == '``'
    ]
    #remove numbers
    filteredTokens = [
        token for token in filteredTokens if not is_number(token)
    ]
    #remove stopwords 30
    filteredTokens = [
        token for token in filteredTokens if not token in stopwords
    ]
    #remove stopwords 150
    # filteredTokens = [token for token in filteredTokens if not token in stopwords[30:151]]

    #case folding
    filteredTokens = [token.lower() for token in filteredTokens]
    #stemming
    # Stemming = PorterStemmer()
    # filteredTokens = [Stemming.stem(token) for token in filteredTokens]

    #remove digits
    filteredTokens = [token for token in filteredTokens if not token.isdigit()]
    return filteredTokens