Example #1
def token(X,
          words_only=False,
          word_normalize=True,
          emoji_normalize=True,
          remove_digits=True,
          lower_case=True,
          stop_words=None):
    '''
        requires PyStemmer if word_normalize = True
        use pip[env] install PyStemmer
        '''

    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)

    # Keep word characters only (digits are kept, emoticons are dropped)
    if words_only:
        clean_text = re.sub('[\W]+', ' ', X)
    else:
        clean_text = '{}{}'.format(re.sub('[\W]+', ' ', X),
                                   ''.join(re.findall(emoticon_re, X)))

    # normalize emoji?
    if emoji_normalize:

        clean_text = (re.sub('[\W]+', ' ', X) + ' '.join(
            re.findall(emoticon_re, X)).replace(';', ':').replace('-', ''))

    if remove_digits:
        clean_text = clean_text.translate(str.maketrans('', '', '0123456789'))

    if lower_case:
        clean_text = clean_text.lower()

    if word_normalize:
        try:
            import Stemmer
            stemmer = Stemmer.Stemmer('danish')
            clean_text = ' '.join(stemmer.stemWords(clean_text.split()))
        except ModuleNotFoundError:
            print('Stemmer module not found. Try "pip install PyStemmer"')
            print('Words were not normalized')
            pass  # continue without stemming

    if stop_words:

        return [word for word in clean_text.split() if word not in stop_words]
    else:
        return clean_text.split()
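A minimal usage sketch for token() (assuming the surrounding module imports re and that PyStemmer is installed; the Danish sample sentence and stop words are made up):

import re  # token() above relies on the module-level name re

sample = "Jeg har 2 hunde :) ;-)"
print(token(sample, remove_digits=True, stop_words={'jeg', 'har'}))
# -> a list of lower-cased, digit-free, Danish-stemmed tokens plus the normalized emoticons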
Example #2
def filter_stemmer(words = set([]), lang="pt"):
	'''
	    Invokes the library that strips all unnecessary parts from a word.
	    
	    @param words: A set of words to apply the stemmer to.
	    @return: A list of the words after stemming.
	'''
	
	stemmer = Stemmer.Stemmer(lang) #@UndefinedVariable
	text = []
	for word in words:
		stm = stemmer.stemWord(word)
		if len(stm) > 0:
			text.append(stm.lower())
	return text
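A short usage sketch (the Portuguese words are only illustrative; the module is assumed to import PyStemmer as Stemmer):

import Stemmer  # PyStemmer

print(filter_stemmer({"correndo", "gatos", "casas"}, lang="pt"))
# -> the lower-cased stems, in arbitrary order because the input is a set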
Example #3
    def __init__(self, index_path=""):
        self.path = index_path

        self.doccount = 0
        self.token_count = 0

        self.init_counts()

        self.ranker = bm25.BM25(index_path=self.path,
                                doccount=self.doccount,
                                tokcount=self.token_count)

        self.stopwords = set()
        self.init_stopwords()

        self.stemmer = Stemmer.Stemmer('english')
Example #4
    def __init__(self, index_path):
        super().__init__()

        self.index_path = index_path
        self.categories = sorted(
            ["references", "body", "infobox", "title", "category", "links"])
        self.query_categories = {
            "c:": "category",
            "b:": "body",
            "t:": "title",
            "i:": "infobox",
            "r:": "references",
            "e:": "links"
        }
        self.tokens_dict = self.get_tokens()
        self.stemmer = Stemmer.Stemmer("english")
Example #5
def splitstringStemKazStop(str, stoplist):
    words = []
    str = str.lower()
    str = str.replace("ё", "е")
    stemmer = Stemmer.Stemmer('russian')
    # for i in re.split('[;,.,\n,\s,:,-,+,(,),=,/,«,»,\d,!,?,"]',str):
    # re.split("(?:(?:[^а-яА-Я]+')|(?:'[^a-zA-Z]+))|(?:[^a-zA-Z']+)"
    for i in re.split("(?:[^а-я0-9әғқңөұүһі]+)", str):
        if len(i) > 1 and len(i) <= 17:
            if i not in stoplist:
                stemmed = stemmer.stemWord(i)
                if len(stemmed) > 1:
                    words.append(stemmed)
                    # words.append(i)  # without stemming

    return words
Example #6
    def __init__(self, xml_file_path, index_directory, stop_words_file):

        self.parser = etree.iterparse(xml_file_path, events=("start", "end"))

        # Reading stop words list
        with open(stop_words_file, "r") as fp:
            self.stop_words = fp.readlines()

        # print(self.stop_words)
        self.stop_words_dict = {}
        # self.stop_words = [word.strip("'") for word in self.stop_words]
        for word in self.stop_words:
            self.stop_words_dict[word.split("\n")[0]] = 1

        self.stemmer = Stemmer.Stemmer('english')
        self.postings_dictionary = dict()
        self.index_directory = index_directory
Example #7
 def __init__(self):
     self.seek_list = None
     self.comment_file = None
     self.index_file = None
     self.symbol_to_encoding_dict = None
     self.cids = None
     self.comment_offsets_cid = None
     self.comment_offsets = None
     self.comment_term_counts = None
     self.comment_csv_reader = None
     self.authors_list = None
     self.articles_list = None
     self.reply_to_index = None
     self.collection_term_count = 0
     self.stemmer = Stemmer.Stemmer('english')
     self.tokenizer = nltk.tokenize.ToktokTokenizer()
     self.report = Report()
Example #8
def clean_portuguese_text(text):
    text = clean_text(text)

    stop_words = get_stop_words('portuguese')

    stop_words.extend([
        "rua", "estrada", "citada", "citado", "endereço", "endereco",
        "caminho", "período", "periodo", "próximo", "proximo", "próxima",
        "proxima", "mencionado", "mencionada", "altura", "complementa",
        "denuncia", "denúncia", "diariamente", "avenida", "município",
        "municipio"
    ])

    words = text.split(' ')

    content = ""

    stemmer = Stemmer.Stemmer('portuguese')

    for word in words:
        word = word.lower()
        if word not in stop_words and word.strip():
            content = content + stemmer.stemWord(word).upper() + " "

    # Remove leading/trailing spaces
    content = content.strip()

    return content
Example #9
    def __init__(self, language):
        """Initialize a StemNormalizer.

        Args:
            language: a PyStemmer language. These can be seen by
                listing Stemmer.algorithms(), but current options are:
                danish, dutch, english, finnish, french, german,
                hungarian, italian, norwegian, portuguese, romanian,
                russian, spanish, swedish, turkish.

                You can also specify "porter" to get the classic
                Porter stemmer for English.

        """
        super(StemNormalizer, self).__init__()

        self.stemmer = Stemmer.Stemmer(language.lower())
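The docstring above refers to PyStemmer's algorithm list; a quick check of that API (output shown in the comments is approximate):

import Stemmer

print(Stemmer.algorithms())          # e.g. ['danish', 'dutch', 'english', ..., 'turkish']
stemmer = Stemmer.Stemmer('porter')  # the classic Porter stemmer for English
print(stemmer.stemWord('running'))   # 'run'
print(stemmer.stemWords(['cats', 'stemming']))  # ['cat', 'stem']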
Example #10
    def __init__(self,
                 enable_case_folding=True,
                 enable_remove_stop_words=True,
                 enable_stemmer=False,
                 enable_lemmatizer=True,
                 min_length=2):
        self.steps = []
        self.SPLIT_WORDS_PATTERN = re.compile(
            r'\s|\/|\\|\.|\:|\?|\(|\)|\[|\]|\{|\}|\<|\>|\'|\!|\"|\-|,|;|\$|\*|\%|#'
        )
        self.steps.append(self.__split_words)
        if enable_case_folding:
            self.steps.append(self.__case_folding)

        if enable_remove_stop_words:
            self.steps.append(self.__remove_stop_words)
            self.stop_words = {
                'a', 'able', 'about', 'across', 'after', 'all', 'almost',
                'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
                'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could',
                'dear', 'did', 'do', 'does', 'either', 'else', 'ever', 'every',
                'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her',
                'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in',
                'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
                'likely', 'may', 'me', 'might', 'most', 'must', 'my',
                'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on',
                'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say',
                'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that',
                'the', 'their', 'them', 'then', 'there', 'these', 'they',
                'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we',
                'were', 'what', 'when', 'where', 'which', 'while', 'who',
                'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your'
            }

        if enable_stemmer:
            self.steps.append(self.__stem)
            self.stemmer = Stemmer.Stemmer('english')

        if enable_lemmatizer:
            self.steps.append(self.__lemmatiza)
            self.lemmatizer = WordNetLemmatizer()

        if min_length:
            self.steps.append(
                lambda words: self.__remove_short_words(words, min_length))
Example #11
    def __init__(
        self,
        stem_threshold=STEM_THRESHOLD,
        max_token_length=MAX_TOKEN_LENGTH,
        min_split_length=MIN_SPLIT_LENGTH,
        single_shot=False,
        save_token_style=False,
        attach_upper=True,
        use_nn=False,
        nn_model=None,
    ):
        """
        Initialize a new TokenSplitter.

        :param stem_threshold: We do not stem split parts shorter than or equal to this size.
        :param max_token_length: We cut identifiers longer than this value.
        :param min_split_length: We do not split source code identifiers shorter than this value. \
                                 If you do not want to filter small tokens set min_split_length=1.
        :param single_shot: True if we do not want to join small identifiers to next one. \
            Example: 'sourced.ml.algorithms' → ["sourc", "sourcedml", "algorithm", "mlalgorithm"].\
            If True we have only ["sourc", "algorithm"]. \
        :param save_token_style: value indicating whether yield metadata that can be used to \
                                 reconstruct the initial identifier.
        :param attach_upper: True to attach the last of several uppercase letters in a row to \
                      the next token. Example: 'HTMLResponse' -> ["html", "response"] if True, \
                      'HTMLResponse' -> ["htmlr", "esponse"] if False.
        :param use_nn: value indicating whether to use the Neural Network-based splitter instead \
                       of the heuristics.
        :param nn_model: IdentifierSplitterBiLSTM model UUID to load. None means the most recent.
        """
        self._stemmer = Stemmer.Stemmer("english")
        self._stemmer.maxCacheSize = 0
        self._stem_threshold = stem_threshold
        self._max_token_length = max_token_length
        self._min_split_length = min_split_length
        self._single_shot = single_shot
        self._save_token_style = save_token_style
        self._attach_upper = attach_upper
        self._id_splitter_nn = None
        if use_nn:
            self._init_nn(nn_model)
        if self._save_token_style and not self._single_shot:
            raise ValueError(
                "Only one of `single_shot`/`save_token_style` should be True"
            )
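The splitting API itself is not part of this snippet, so the following is only a rough sketch of the stem-threshold idea described in the docstring, using plain PyStemmer (the STEM_THRESHOLD value and the stem_part helper are assumptions, not the library's code):

import Stemmer

STEM_THRESHOLD = 6  # assumed value

stemmer = Stemmer.Stemmer("english")
stemmer.maxCacheSize = 0  # disable PyStemmer's internal cache, as in the constructor above

def stem_part(part):
    # hypothetical helper: parts at or below the threshold are kept verbatim
    return part if len(part) <= STEM_THRESHOLD else stemmer.stemWord(part)

print([stem_part(p) for p in ["html", "response", "algorithms"]])
# -> ['html', 'respons', 'algorithm']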
Example #12
 def __init__(self,
              stem_threshold=STEM_THRESHOLD,
              max_token_length=MAX_TOKEN_LENGTH,
              min_split_length=MIN_SPLIT_LENGTH,
              single_shot=DEFAULT_SINGLE_SHOT,
              save_token_style=SAVE_TOKEN_STYLE,
              attach_upper=ATTACH_UPPER):
     self._stemmer = Stemmer.Stemmer("english")
     self._stemmer.maxCacheSize = 0
     self._stem_threshold = stem_threshold
     self._max_token_length = max_token_length
     self._min_split_length = min_split_length
     self._single_shot = single_shot
     self._save_token_style = save_token_style
     self._attach_upper = attach_upper
     if self._save_token_style and not self._single_shot:
         raise ValueError(
             "Only one of `single_shot`/`save_token_style` should be True")
Example #13
    def transform(self, X):
        self.X = X

        # eyes [nose] mouth | mouth [nose] eyes pattern
        emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
        emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)

        # Keep word characters only (digits are kept, emoticons are dropped)
        if self.words_only:
            clean_text = self.X.apply(lambda x: (re.sub('[\W]+', ' ', x)))
        else:
            clean_text = self.X.apply(lambda x: ('{}{}'.format(
                re.sub('[\W]+', ' ', x), ''.join(re.findall(emoticon_re, x)))))

        # normalize emoji?
        if self.emoji_normalize:

            clean_text = self.X.apply(lambda x: (re.sub(
                '[\W]+', ' ', x) + ' '.join(re.findall(
                    emoticon_re, x)).replace(';', ':').replace('-', '')))

        if self.remove_digits:
            clean_text = clean_text.apply(
                lambda x: x.translate(str.maketrans('', '', '0123456789')))

        if self.lower_case:
            clean_text = clean_text.str.lower()

        if self.word_normalize:
            try:
                import Stemmer
                stemmer = Stemmer.Stemmer('danish')
                clean_text = clean_text.apply(
                    lambda x: ' '.join(stemmer.stemWords(x.split())))
            except ModuleNotFoundError:
                print('Stemmer module not found. Try "pip install PyStemmer"')
                print('Words were not normalized')
                pass  # continue without stemming

        if self.token:
            return clean_text.str.split()
        else:
            return clean_text
Example #14
class Cleaner(object):
  REJECT_BOTH = {'.', '(', ')', '!', '?', ',', 'num', 'и', 'в', 'с', 'о', 'об', 'от', 'я', 'по', 'на', 'ты', 'он' }
  REJECT_POST = { 'не' }

  stemmer = Stemmer.Stemmer('russian')

  def __init__(self, stemming=True, locale='ru', ngrams_size = 1):
    assert locale == 'ru'
    self._stemming = stemming
    self._ngrams_size = ngrams_size

  def words(self, text):
    cleaned_words = self._clean(text).split(' ')
    if cleaned_words[0]  == '': cleaned_words = cleaned_words[1:]
    if len(cleaned_words) > 0 and cleaned_words[-1] == '': cleaned_words = cleaned_words[:-1]

    words = self._stem(cleaned_words) if self._stemming is True else cleaned_words
    words += self._ngrams(words)

    return words

  def _clean(self, string):
    string = re.sub(r"[ёЁ]",               "е",     string)
    string = re.sub(r"[^А-Яа-я0-9(),!?.]", " ",     string)
    string = re.sub(r"\d+(\.|,)?\d*",      " num ", string)
    string = re.sub(r"\.",                 " . ",   string)
    string = re.sub(r",",                  " , ",   string)
    string = re.sub(r"!",                  " ! ",   string)
    string = re.sub(r"\(",                 " ( ",   string)
    string = re.sub(r"\)",                 " ) ",   string)
    string = re.sub(r"\?",                 " ? ",   string)
    string = re.sub(r"\s{2,}",             " ",     string)
    return string.strip().lower()

  def _stem(self, words):
    return self.stemmer.stemWords(words)

  def _ngrams(self, words):
    if self._ngrams_size == 1: return []

    ngrams = [' '.join(words[i:i + self._ngrams_size]) for i in range(len(words) - self._ngrams_size + 1) if words[i] not in self.REJECT_BOTH and words[i + 1] not in self.REJECT_BOTH and words[i + 1] not in self.REJECT_POST]

    return ngrams
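A usage sketch for the Cleaner class (assuming the module imports re and Stemmer; the Russian sentence is made up):

import re
import Stemmer

cleaner = Cleaner(stemming=True, ngrams_size=2)
print(cleaner.words("Привет, мир! Сегодня 25 градусов."))
# -> stemmed tokens plus 2-grams; the number is replaced by the 'num' placeholder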
Example #15
    def tokenize(self, text, lang, rem_sw, let_stemming):
        sents_text, sents_offset, sents_start_end, sent_based_voc= [],[],[],{}
        text = text.replace(chr(0), ' ')
        text = text.replace('*', ' ')
        text = text.replace('(', ' ')
        text = text.replace(')', ' ')
        text = text.replace('|', ' ')
        text = text.replace('\ufeff', ' ')

        sent_detector = nltk.data.load('tokenizers/punkt/' + lang + '.pickle')
        stemmer = Stemmer.Stemmer(lang)
        word_detector = nltk.TreebankWordTokenizer()
        sent_spans = sent_detector.span_tokenize(text)
        if rem_sw == 0:
            stopwords = []
        elif rem_sw == 1:
            stopwords = copy.deepcopy(self.langstopwords[lang])
        sents_vect = []
        for span in sent_spans:  # For each sentence
            sent_dic = {}
            sents_text.append(text[span[0]:span[1]].lower())
            for word in word_detector.tokenize(
                    sents_text[-1]):  # for each word in the sentence
                if len(word) > 2 and word not in stopwords:
                    if let_stemming == 1:
                        word_pp = stemmer.stemWord(word)
                    else:
                        word_pp = word
                else:
                    continue
                if word_pp in sent_dic:
                    sent_dic[word_pp] += 1
                else:
                    sent_dic[word_pp] = 1
                    if word_pp in sent_based_voc:
                        sent_based_voc[word_pp] += 1
                    else:
                        sent_based_voc[word_pp] = 1

            sents_vect.append(sent_dic)
            sents_offset.append([span[0], span[1] - span[0]])
            sents_start_end.append([span[0], span[1]])
        return sents_text, sents_vect, sents_offset, sents_start_end, sent_based_voc
Example #16
def tokenize(text, Type):
    global STOP_WORDS
    global current_token
    tokens = re.split(r'[^A-Za-z0-9]+', text)
    length = len(tokens)
    current_token += length
    stemmer = Stemmer.Stemmer('english')
    if Type == 1:
        if length > 0 and tokens[0] == 'redirect':
            return
        if length > 1 and tokens[1] == 'redirect':
            return
    for token in tokens:
        cur_token = stemmer.stemWord(token.lower().casefold())
        if cur_token != "" and cur_token not in STOP_WORDS and token.lower(
        ) not in STOP_WORDS:
            if cur_token not in index:
                index[cur_token] = [0, 0, 0, 0, 0, 0]
            index[cur_token][Type] += 1
Example #17
    def test_inaugural(self):
        # preparing data
        usprez = shorttext.data.inaugural()
        docids = sorted(usprez.keys())
        usprez = [' '.join(usprez[docid]) for docid in docids]
        usprezdf = pd.DataFrame({'yrprez': docids, 'speech': usprez})
        usprezdf = usprezdf[['yrprez', 'speech']]

        stemmer = Stemmer.Stemmer('english')

        # preprocesser defined
        pipeline = [
            lambda s: re.sub('[^\w\s]', '', s),
            lambda s: re.sub('[\d]', '', s), lambda s: s.lower(),
            lambda s: ' '.join([
                stemmer.stemWord(token)
                for token in shorttext.utils.tokenize(s)
            ])
        ]
        txtpreprocessor = shorttext.utils.text_preprocessor(pipeline)

        # corpus making
        docids = list(usprezdf['yrprez'])
        corpus = [
            txtpreprocessor(speech).split(' ') for speech in usprezdf['speech']
        ]

        # making DTM
        dtm = shorttext.utils.DocumentTermMatrix(corpus,
                                                 docids=docids,
                                                 tfidf=True)

        # check results
        self.assertEqual(len(dtm.dictionary), 5252)
        self.assertAlmostEqual(
            dtm.get_token_occurences(stemmer.stemWord('change'))['2009-Obama'],
            0.013937471327928361)
        numdocs, numtokens = dtm.dtm.shape
        self.assertEqual(numdocs, 56)
        self.assertEqual(numtokens, 5252)
        self.assertAlmostEqual(dtm.get_total_termfreq('government'),
                               0.27875478870737563)
Example #18
    def __init__(self, outputdir, statfile, iter):
        self.outputdir = outputdir
        self.statfile = statfile
        self.iter = iter

        self.createOutputDir()
        # porter = PorterStemmer()
        # lancaster=LancasterStemmer()
        self.stemer = Stemmer.Stemmer('english')
        # self.englishStemmer=SnowballStemmer("english")
        self.wCount = {}
        self.storeStem = {}
        self.max = int(int(resource.getrlimit(resource.RLIMIT_NOFILE)[0]) / 2)
        if self.max < 1:
            self.max = 200
        self.stop_words = set(stopwords.words('english'))
        self.wStr = {}
        self.curDocID = 0
        self.docID = 0
        self.count = 0
Example #19
def processQuery(query):
  #lowerCase
  query=query.lower()
  
  #remove underscores and other unnecessary characters
  re6 = re.compile(r'[\_]', re.DOTALL)
  query = re6.sub(' ', query)
  query = re.findall("\d+|[\w]+", str(query))
  
  #remove stopwords
  stop_word = set(stopwords.words('english'))
  query = [w for w in query if w not in stop_word]

  #stem the tokens
  stemmer=Stemmer.Stemmer('english')
  stemmed_data=[]
  for words in query:
    stemmed_data.append(stemmer.stemWord(words))
  query=stemmed_data
  return query
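A usage sketch for processQuery (assuming the module imports re, Stemmer and nltk.corpus.stopwords, and that the NLTK stopword corpus is available):

import re
import Stemmer
from nltk.corpus import stopwords

print(processQuery("Where is the Eiffel_Tower located?"))
# -> roughly ['eiffel', 'tower', 'locat'] after stop-word removal and stemming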
Example #20
    def improvedTokenizer(docdict):

        stopwords = []

        #ps = PorterStemmer()
        s = Stemmer.Stemmer('porter')

        #read stopword_file
        fs = open("snowball_stopwords_EN.txt", 'r')
        for line in fs:
            stopwords.append(line.strip())

        #split each dictionary entry (of key TI) by word
        text = docdict["TI"]

        #creates list of words not in stopwords
        text = list(filter(lambda word: word not in stopwords, text))
        docdict["TI"] = s.stemWords(text)

        return docdict
Example #21
    def stem_data(self):
        """Stems the data, using Porters algorithm"""

        stemmer = Stemmer.Stemmer('english')

        # The stemming object

        def stem_string(string):
            """Input a string, returns a string with the 
            words replaced by their stemmed equivalents"""
            stemmed_list = []
            for word in string.split():
                stemmed_word = stemmer.stemWord(word)
                stemmed_list.append(stemmed_word)

            stemmed_string = " ".join(stemmed_list)
            return stemmed_string

        self.body = stem_string(self.body)
        self.subject = stem_string(self.subject)
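An aside, not part of the example: PyStemmer can also stem a whole list in one call, so stem_string could equivalently be written with stemWords:

import Stemmer

stemmer = Stemmer.Stemmer('english')

def stem_string(string):
    # same behaviour as the word-by-word loop above, using the batch API
    return " ".join(stemmer.stemWords(string.split()))

print(stem_string("connection connected connecting"))  # "connect connect connect"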
Example #22
def getQueries():
    global userQueries, stemQueries
    doc = etree.parse('topics.xml', parser=etree.XMLParser())
    root = doc.getroot()

    for child in root.iter('topic'):
        query = child.find('query').text.strip()
        query = re.split(r'\W+(\.?\W+)*', query,
                         flags=re.IGNORECASE)  # same Regex from tokenizing
        userQueries[int(child.attrib['number'])] = query

    stopWords = getStopWords()
    stemmer = Stemmer.Stemmer('english')
    for id in sorted(userQueries):
        stemQueries[id] = []  # stemming using query id
        for query in userQueries[id]:
            if query and query is not None and query not in stopWords:
                token = query.lower()
                stem = stemmer.stemWord(token)
                stemQueries[id].append(stem)
Example #23
    def __init__(self):

        self.stemmer = Stemmer.Stemmer('english')

        # -------------------------------- Read and store irregular nouns

        self.irregularNounsPlural = []
        self.irregularNouns = []

        irregularNounsPluralFile = open("WordsList/irregularNounsPlural.txt",
                                        "r")
        irregularNounsFile = open("WordsList/irregularNouns.txt", "r")

        for pluralNoun in irregularNounsPluralFile:

            self.irregularNounsPlural.append(pluralNoun.replace('\n', ''))

        for noun in irregularNounsFile:

            self.irregularNouns.append(noun.replace('\n', ''))

        # ------------------------- Read and store irregular verb endings

        self.irregularEndings = []

        irregularEndingsFile = open("WordsList/irregularEndings.txt", "r")

        for suffix in irregularEndingsFile:

            self.irregularEndings.append(suffix.replace('\n', ''))

        # --------------------------------------------- Remove stop words

        self.stopList = []

        stopListFile = open("WordsList/stop.txt", "r")

        for stopWord in stopListFile.readlines():

            stop_word = stopWord.strip()
            self.stopList.append(stop_word)
Example #24
def compare_stemmers(algorithm, words):
    """
    Make sure pystemmer and purestemmer return the same stems.

    ``algorithm`` is the name of the algorithm to be tested and
    ``words`` is a list of input words.
    """
    py = Stemmer.Stemmer(algorithm)
    pure = purestemmer.Stemmer(algorithm)
    for word in words:
        variants = _get_variants(word)
        for variant in variants:
            py_stem = py.stemWord(variant)
            pure_stem = pure.stemWord(variant)
            assert py_stem == pure_stem, (
                'Different output for %r: pystemmer returned %r, '
                'purestemmer returned %r.' % (variant, py_stem, pure_stem))
            assert type(py_stem) == type(pure_stem), (
                'Different output types for %r: pystemmer returned %s, '
                'purestemmer returned %s.' %
                (variant, type(py_stem), type(pure_stem)))
Example #25
    def get_data_from_file(self):
        stemmer = Stemmer.Stemmer('english')
        infile = open(self.filename, "r")
        self.data = json.load(infile)
        infile.close()

        for ids in self.data:
            tweet = self.data[ids][u'tags']
            tweet_text = [words[0] for words in tweet]
            # print tweet_text
            clean_tweet = ' '.join(tweet_text)
            regex_form = '^rt\s+|@\w+:*|https?://[\w\.\/]*'
            clean_tweet = re.sub(regex_form, '', clean_tweet)
            clean_tweet = [stemmer.stemWord(x) for x in clean_tweet.split()]

            for item in clean_tweet:
                self.unigram_vocab[item] += 1

            for item in list(nltk.bigrams(clean_tweet)):
                self.bigram_vocab[item] += 1

            for item in list(nltk.trigrams(clean_tweet)):
                self.trigram_vocab[item] += 1
            # break
        temp = [k for k, v in self.unigram_vocab.items() if v >= 5]
        self.unigram_vocab = temp

        temp = [k for k, v in self.bigram_vocab.items() if v >= 5]
        self.bigram_vocab = temp

        temp = [k for k, v in self.trigram_vocab.items() if v >= 3]
        self.trigram_vocab = temp
        self.trigram_vocab = temp

        self.features = self.unigram_vocab + self.bigram_vocab + self.trigram_vocab

        for index, item in enumerate(self.features):
            self.featureIndex[item] = index

        # infile.close()
        print "Finished composing the Lexical Features"
Example #26
def tf_vector(tweet):
	"""
	Transform a string into a Term-Frecuency dictionary
	:param tweet: Text to process
	:param stopwords: A list of string of stopwords
	:param emoticons: A list of string of emoticons (see __main__ in this script)
	:param emojis: A list of string of emoticons (see __main__ in this script)
	:return: a dict object in the form {term=count}, with all terms preprocessed
	"""
	path = os.path.dirname(os.path.abspath(__file__))
	stopwords = open(os.path.join(path,DATAPATH,STOPWORDS)).read().splitlines()
	emoticons = open(os.path.join(path,DATAPATH,EMOTICONS)).read().splitlines()
	emoj = pd.read_csv(os.path.join(path,DATAPATH,EMMOJIS))
	emojis = list(emoj['emoji'])
 
	token_list = ['URL', 'EMAIL', 'MENTION', 'HASHTAG', 'NUMBER', 'EMOTICON', 'EMOJI']

	x = tweet
	x = re.sub("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", " URL ", x)
	x = re.sub("^[_A-Za-z0-9-\\\\+]+(\\\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9-]+(\\\\.[A-Za-z0-9]+)*(\\\\.[A-Za-z]{2,})$", " EMAIL ", x)
	x = re.sub("@[A-Za-z0-9]+", " MENTION ", x)
	x = re.sub("#[A-Za-z0-9]+", " HASHTAG ", x)
	x = re.sub("\\d+(\\.\\d*)?°?", " NUMBER ", x)
	for em in emoticons:
		x = x.replace(em, ' EMOTICON ')
	for ej in emojis:
		x = x.replace(ej, ' EMOJI ')
	x = re.sub(u'['u'\U0001F300-\U0001F64F'u'\U0001F680-\U0001F6FF'u'\u2600-\u26FF\u2700-\u27BF]+', ' EMOJI ', x)
	x = re.sub("[\\\"\\$%&@\\.,:;\\(\\)¿\\?`+\\-_\\*=!¡\\\\/#{}\\[\\]]", " ", x)
	x = re.sub("\\s+", " ", x)
	x = x.strip()
 
	words = x.split(' ')
	words_nonstop = [w for w in words if not w in stopwords]
	words_nonstop_lower = [w.lower() if not w in token_list else w for w in words_nonstop]
 
	stemmer = Stemmer.Stemmer('spanish')
	words_nonstop_lower_stemmed = stemmer.stemWords(words_nonstop_lower)
 
	return dict(Counter(words_nonstop_lower_stemmed))
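The function above depends on data files shipped with the project; a stripped-down, self-contained sketch of the core idea (stem the tokens, then count them) on an already-cleaned Spanish token list:

from collections import Counter
import Stemmer

stemmer = Stemmer.Stemmer('spanish')
tokens = ['los', 'gatos', 'corren', 'y', 'los', 'perros', 'corren']
print(dict(Counter(stemmer.stemWords(tokens))))
# counts are keyed by stem, so both occurrences of 'corren' land on the same key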
Example #27
 def wildcard_search(self, word):
     if word.find("*") == -1:
         # No wildcard
         try:
             stemmer = Stemmer.Stemmer('spanish')
             stemmed_word = stemmer.stemWord(word)
             return list([self._btree[stemmed_word]])
         except KeyError:
             return None
     elif word[-1] == "*":
         return list(
             self._btree.values(min=word[:-1] + self._alphabet[0],
                                max=word[:-1] + self._alphabet[26]))
     elif word[0] == "*":
         print("Desde %s hasta %s" % (word[::-1][:-1] + self._alphabet[0],
                                      word[::-1][:-1] + self._alphabet[26]))
         return list(
             self._reverse_btree.values(
                 min=word[::-1][:-1] + self._alphabet[0],
                 max=word[::-1][:-1] + self._alphabet[26]))
     else:
         return None
Example #28
    def get_translation_and_statistics_parse_stemming_ext_snowball(
            self, snowball_language_shortenning, min_occurences=0):
        lifo_stack = []
        stemmers = dict()
        for language_shortening in snowball_language_shortenning:
            stemmers[language_shortening] = Stemmer.Stemmer(
                language_shortening)

        self.load_stop_words(snowball_language_shortenning)
        self.prepare_statistic_structures(snowball_language_shortenning)

        if min_occurences < 2:
            self.parse_stemmer_ext_snowball(lifo_stack, self.root_node,
                                            snowball_language_shortenning,
                                            stemmers)
        else:
            print("SUCCESS")
            self.parse_stemmer_ext_min_snowball(lifo_stack, self.root_node,
                                                snowball_language_shortenning,
                                                min_occurences, stemmers)

        self.print_statistics(snowball_language_shortenning)
Example #29
 def convert_src_vector(self, src_vect, lang, rem_sw, let_stemming,
                        fs_knword):
     res_vect, res_voc = [], {}
     if rem_sw == 0:
         stopwords = []
     elif rem_sw == 1:
         stopwords = copy.deepcopy(self.langstopwords['english'])
     stemmer = Stemmer.Stemmer('english')
     for seg_vect in src_vect:
         temp_res_vect = {}
         for word, freq in seg_vect.items():
             # trans_w_lst= self.blc.get_nearest_token(word, lang, fs_knword)
             # if trans_w_lst:
             #     for trans_w,score in trans_w_lst.items():
             #         if trans_w not in stopwords:
             #             if let_stemming==0:
             #                 temp_res_vect[trans_w]=freq
             #             else:
             #                 trans_w=stemmer.stemWord(trans_w)
             #                 temp_res_vect[trans_w] = freq
             trans_w = self.blc.get_nearest_token(word, lang, fs_knword)
             if trans_w:
                 if trans_w not in stopwords:
                     if let_stemming == 0:
                         temp_res_vect[trans_w] = freq
                     else:
                         trans_w = stemmer.stemWord(trans_w)
                         temp_res_vect[trans_w] = freq
             else:
                 trans_w = word
                 temp_res_vect[trans_w] = freq
         res_vect.append(temp_res_vect)
         for tw in temp_res_vect.keys():
             if tw in res_voc:
                 res_voc[tw] += 1
             else:
                 res_voc[tw] = 1
     return res_vect, res_voc
Example #30
def words_filter1(sourceword):
    """根据WordNet过滤, 这个是没有用停用词过滤的,而是对Wordnet词典进行查找,找到就加入,没有找到的话先还原下,然后再查,仍然没有则丢弃
    """
    stemmer = Stemmer.Stemmer('english')
    #print(stemmer.stemWords(sourceword))  # using the stemmer directly gives wrong stems for some words
    #return words_filter(stemmer.stemWords(sourceword), StopWordList().words)
    stopwordlist = StopWordList().words
    
    purewords = []
    for oneword in sourceword:
        
        for d in Dictionaries:
            part = d.pos
            try:
                getWord(oneword, part)
                purewords.append(oneword)
                # stop as soon as the word is found under any part of speech
                break
            except:
                pass
        else:
            # fall back to the stemmer and try again, e.g. looking up "dogs" as "dog"
            oneword = stemmer.stemWord(oneword)
            for d in Dictionaries:
                part = d.pos
                try:
                    getWord(oneword, part)
                    purewords.append(oneword)
                    break
                except:
                    pass
            else:
                # still not found even after stemming, so give up on this word
                # (the commented-out lines below would keep it if it is not a stop word)
                pass
                #if oneword not in stopwordlist:
                #    purewords.append(oneword)
    return purewords
    # return words_filter(purewords, StopWordList().words)
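getWord and Dictionaries above are project-specific; a rough sketch of the same look-up-then-stem-and-retry idea using NLTK's WordNet instead (an assumption, not the dictionary layer used above):

import Stemmer
from nltk.corpus import wordnet  # assumes the NLTK 'wordnet' corpus is downloaded

stemmer = Stemmer.Stemmer('english')

def wordnet_filter(words):
    kept = []
    for word in words:
        if wordnet.synsets(word):              # found as-is, keep it
            kept.append(word)
        else:
            stem = stemmer.stemWord(word)      # e.g. try 'dogs' as 'dog'
            if wordnet.synsets(stem):
                kept.append(stem)
    return kept

print(wordnet_filter(['dogs', 'running', 'qwertyuiop']))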