# NLTK imports needed by this tokenizer (word_tokenize additionally requires
# the punkt data: nltk.download('punkt'))
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize


class WordTokenizer(object):
    def __init__(self, stemmer='porter'):
        self.stemmer = stemmer
        if stemmer == 'wordnet':
            self.wnl = WordNetLemmatizer()
        if stemmer == 'porter':
            self.wnl = PorterStemmer()
        if stemmer == 'snowball':
            self.wnl = SnowballStemmer('english')

    def __call__(self, doc):
        if self.stemmer == 'wordnet':
            return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
        else:
            return [self.wnl.stem(t) for t in word_tokenize(doc)]
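A minimal usage sketch (assuming the NLTK punkt data has been downloaded): because the tokenizer is callable, it can be passed straight to scikit-learn's TfidfVectorizer.

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical corpus; any list of strings works.
docs = ["The cats are running", "A cat ran home"]
vectorizer = TfidfVectorizer(tokenizer=WordTokenizer(stemmer='porter'))
X = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # stemmed vocabulary, e.g. 'cat', 'run'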
Example 2
class Normalizer:
    def __init__(self, norm: str, language: str):
        self.norm = norm
        self.language = language
        if self.norm == "lemmatization":
            # PyMystem3 does not support English;
            # NLTK's WordNetLemmatizer does not support Russian
            if self.language == "ru":
                self.alg = Mystem()
            elif self.language == "en":
                self.alg = WordNetLemmatizer()
        # Stemming
        elif self.norm == "stemming":
            self.alg = SnowballStemmer(expand_language(self.language))
        else:
            raise ValueError(
                "{} is not supported. "
                "Available options: 'lemmatization', 'stemming'".format(
                    self.norm))

    def normalize(self, text: str, return_list=False):
        res = None
        token_list = None
        # Lemmatization
        if self.norm == "lemmatization":
            # PyMystem3 does not support English;
            # NLTK's WordNetLemmatizer does not support Russian
            if self.language == "ru":
                token_list = self.alg.lemmatize(text)
            elif self.language == "en":
                token_list = list(map(self.alg.lemmatize, text.split()))
        # Stemming
        elif self.norm == "stemming":
            token_list = list(map(self.alg.stem, text.split()))
        # Choose the output format
        if not return_list:
            res = " ".join(remove_empty_items(token_list))
        else:
            res = token_list
        return res
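A quick usage sketch, assuming the project helpers expand_language (e.g. "en" -> "english") and remove_empty_items (drops blank tokens) behave as their names suggest:

norm = Normalizer(norm="stemming", language="en")
print(norm.normalize("the cats were running"))                    # e.g. "the cat were run"
print(norm.normalize("the cats were running", return_list=True))  # e.g. ["the", "cat", "were", "run"]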
class Lemmatizer(PreprocessingStep):
    """
    spacy lemma file precalculation example:

    Lemmatizer().precalculate_spacy_english_lemmatizer([Etour(), Itrust()])
    """
    COLUMN_LEMMA = "lemma"
    class LemmatizerType(Enum):
        english_nltk = 1
        english_spacy = 2
        italian_nltk = 3  # is a stemmer, nltk does not have an italian lemmatizer
        italian_spacy = 4

    def __init__(self, lemmatizer_type=LemmatizerType.english_nltk):
        self._lemmatizer_type = lemmatizer_type
        self._lemmatizer = None
        if lemmatizer_type == self.LemmatizerType.english_nltk:
            self._lemmatizer = WordNetLemmatizer()
        elif lemmatizer_type == self.LemmatizerType.english_spacy:
            # Use precalculated files for spacy since free google colab can't handle fasttext model and spacy lemmatizer at once
            if not FileUtil.file_exists(PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV):
                log.error(
                    f"{PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV} does not exist. The spacy lemmatizer needs a precalculated lemma file."
                )
            self._lemmatizer = PandasUtil.read_csv_to_dataframe(
                PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV)
        elif lemmatizer_type == self.LemmatizerType.italian_nltk:
            self._lemmatizer = SnowballStemmer("italian")
        elif lemmatizer_type == self.LemmatizerType.italian_spacy:
            # Use precalculated files for spacy since free google colab can't handle fasttext model and spacy lemmatizer at once
            if not FileUtil.file_exists(PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV):
                log.error(
                    f"{PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV} does not exist. The spacy lemmatizer needs a precalculated lemma file."
                )
            self._lemmatizer = PandasUtil.read_csv_to_dataframe(
                PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
        else:
            log.error(f"Unknown case for LemmatizerType: {lemmatizer_type}")

    def execute(self, text_tokens):
        if self._lemmatizer_type == self.LemmatizerType.english_nltk:
            return [self._lemmatizer.lemmatize(token) for token in text_tokens]
        elif self._lemmatizer_type == self.LemmatizerType.english_spacy or self._lemmatizer_type == self.LemmatizerType.italian_spacy:
            return [
                self._lemmatizer.at[token, self.COLUMN_LEMMA]
                if token in self._lemmatizer.index else token
                for token in text_tokens
            ]
        elif self._lemmatizer_type == self.LemmatizerType.italian_nltk:
            return [self._lemmatizer.stem(token) for token in text_tokens]

    @classmethod
    def _precalculate_spacy_lemmatizer(cls, spacy_lemmatizer, datasets,
                                       output_path):
        dataset_tuples = []
        for dataset in datasets:
            req_tokenizer = WordTokenizer(dataset, not dataset.is_english())
            req_pre = Preprocessor([
                UrlRemover(),
                Separator(),
                NonLetterFilter(),
                CamelCaseSplitter(),
                LowerCaseTransformer()
            ])
            code_tokenizer = JavaCodeASTTokenizer(
                dataset, WordTokenizer(dataset, not dataset.is_english()))
            code_pre = Preprocessor([
                UrlRemover(),
                Separator(),
                NonLetterFilter(),
                CamelCaseSplitter(),
                JavaCodeStopWordRemover(not dataset.is_english()),
                LowerCaseTransformer()
            ])
            dataset_tuples.append(
                (dataset, code_pre, code_tokenizer, req_pre, req_tokenizer))

        word_to_lemma_map = {}

        def iterate_files(tokenizer, preprocessor, folder):
            for file in FileUtil.get_files_in_directory(folder, True):
                file_representation = tokenizer.tokenize(file)
                file_representation.preprocess(preprocessor)
                for word in file_representation.token_list:
                    lemma = [token.lemma_ for token in spacy_lemmatizer(word)]
                    if len(lemma) > 1:
                        log.info(
                            f"More than one lemma {lemma} for \"{word}\". Using \"{''.join(lemma)}\" as lemma"
                        )
                    lemma = "".join(lemma)
                    if word in word_to_lemma_map:
                        if not word_to_lemma_map[word] == lemma:
                            log.info(
                                f"Different duplicate lemma for {word}: {word_to_lemma_map[word]} <-> {lemma}"
                            )
                    else:
                        word_to_lemma_map[word] = lemma

        for dataset, code_pre, code_tok, req_pre, req_tok in dataset_tuples:

            iterate_files(req_tok, req_pre, dataset.req_folder())
            iterate_files(code_tok, code_pre, dataset.code_folder())

        word_to_lemma_dataframe = pandas.DataFrame.from_dict(
            word_to_lemma_map, orient="index", columns=[cls.COLUMN_LEMMA])
        PandasUtil.write_dataframe_to_csv(word_to_lemma_dataframe, output_path)

    @classmethod
    def precalculate_spacy_english_lemmatizer(cls, datasets):
        cls._precalculate_spacy_lemmatizer(
            en_core_web_trf.load(disable=['ner', 'parser']), datasets,
            PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV
        )  # we only need the lemmatizer component, disable the others

    @classmethod
    def precalculate_spacy_italian_lemmatizer(cls, datasets):
        cls._precalculate_spacy_lemmatizer(
            it_core_news_lg.load(disable=['ner', 'parser']), datasets,
            PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
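A usage sketch following the class docstring; Etour and Itrust are dataset classes from this project, and the precalculated CSV paths are assumed to exist after the first call:

# Heavy one-off step: build the word -> lemma CSV with spacy.
Lemmatizer.precalculate_spacy_english_lemmatizer([Etour(), Itrust()])

# Afterwards the cheap CSV-backed lookup can be used instead of spacy itself.
lemmatizer = Lemmatizer(Lemmatizer.LemmatizerType.english_spacy)
print(lemmatizer.execute(["running", "cats", "some_unseen_token"]))  # unseen tokens pass through unchanged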
Example 4
class NLP():
    def __init__(self, remove_stopwords=True, replace_words=True,
                 remove_numbers=True, remove_html_tags=True,
                 remove_punctuations=True, lemmatize=False,
                 lemmatize_method='wordnet'):
        """
        This package contains functions that can help during the
        preprocessing of text data.
        :param remove_stopwords: boolean
            default value = True
        :param replace_words: boolean
            default value = True
        """
        if (type(remove_stopwords) != bool or
            type(replace_words) != bool or
            type(remove_numbers) != bool or
            type(remove_html_tags) != bool or
            type(remove_punctuations) != bool or
            type(lemmatize) != bool):
            raise Exception("Error - expecting a boolean parameter")
        if lemmatize_method not in ['wordnet', 'snowball']:
            raise Exception("Error - lemmatizer method not supported")
        self.doc = None
        self.lemmatizer = None
        self.remove_stopwords = remove_stopwords
        self.replace_words = replace_words
        self.remove_numbers = remove_numbers
        self.remove_html_tags = remove_html_tags
        self.remove_punctations = remove_punctuations
        self.lemmatize_method = lemmatize_method
        self.lemmatize = lemmatize
        self.stopword_list = set(stopwords)
        self.replacement_list = to_replace
        if self.lemmatize_method == 'wordnet':
            self.lemmatizer = WordNetLemmatizer()
        if self.lemmatize_method == 'snowball':
            self.lemmatizer = SnowballStemmer('english')
    

    def remove_stopwords_fun(self):
        """
        This function removes stopwords from doc.
        It works by tokenizing the doc and then
        checking if the word is present in stopwords
        """
        tokens = str(self.doc).split()
        cleaned_tokens = [token for token in tokens
                          if token.lower() not in self.stopword_list]
        self.doc = ' '.join(cleaned_tokens)

    def replace_words_fun(self):
        """
        This function replaces words (e.g. contractions)
        by checking whether a word is present in a dictionary;
        if the word is present in the dictionary, it is replaced
        with its value from the dictionary
        """

        cleaned_doc = []
        for word in str(self.doc).split():
            if word.lower() in self.replacement_list.keys():
                cleaned_doc.append(self.replacement_list[word.lower()])
            else:
                cleaned_doc.append(word)
        self.doc = ' '.join(cleaned_doc)

    def remove_numbers_fun(self):
        """
        This function uses regex to remove
        all the numbers from the doc.
        """
        self.doc = re.sub("[0-9]", "", self.doc)

    def remove_html_tags_fun(self):
        """
        This function uses regex's compile method
        to remove all the HTML tags from the doc
        """
        cleaner = re.compile('<.*?>')
        cleaned_text = re.sub(cleaner, '', self.doc)
        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
        self.doc = cleaned_text

    def remove_punctations_fun(self):
        """
        This function uses regex to remove all the
        punctuation from the doc.
        """ 
        self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)

    def lemmatize_fun(self):
        """
        This function applies lemmatization or stemming to the words.
        It can be operated with either WordNetLemmatizer
        or Snowball Stemmer
        ---------------------------
        Example:
        NLP(lemmatize=True, lemmatize_method='snowball')

        default value = 'wordnet'
        """
        tokens = str(self.doc).split()
        cleaned_tokens = None
        if self.lemmatize_method == 'wordnet':
            cleaned_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        else:
            cleaned_tokens = [self.lemmatizer.stem(token) for token in tokens]
       
        self.doc = ' '.join(cleaned_tokens)

    def add_stopword(self, words):
        """
        This function is used to add new stopwords
        to the predefined list
        Parameters - ["new_stopword"]
        ------------------------------
        Example -
        obj = NLP()
        obj.add_stopword(["first_word", "second_word"])
        """
        if self.remove_stopwords is False:
            raise Exception("Please enable removal of stopwords")
        if type(words) != list:
            raise Exception("Error - pass stopwords in list")
        for word in words:
            self.stopword_list.add(word)

    def add_replacement(self, replacements):
        """
        This function is used to add new replacement words
        to the predefined list
        Parameters - {"word": "replacement"}
        ----------------------------
        Example -
        obj = NLP()
        obj.add_replacement({"first": "replacement1", "second": "replacement2"})
        """
        if self.replace_words is False:
            raise Exception("Please enable replacement of words")
        if type(replacements) != dict:
            raise Exception("Error - pass input parameters in dict")
        if replacements == {}:
            raise Exception("Error - dict is empty")
        for key, value in replacements.items():
            self.replacement_list[key] = value

    def remove_stopword(self, words):
        """
        This function is used to remove stopwords from the predefined list.
        (Named remove_stopword, singular, so it is not shadowed by the
        remove_stopwords boolean attribute set in __init__.)
        Parameters - ["first_word"]
        ------------------------------
        Example
        obj = NLP()
        obj.remove_stopword(['new_stopword_here'])

        """
        if self.remove_stopwords is False:
            raise Exception("Error - enable stopword removal functionality")
        if type(words) != list:
            raise Exception("Error - expected a list")
        if words == []:
            raise Exception("Error - no items to remove from stopword list")
        for word in words:
            if word in self.stopword_list:
                self.stopword_list.remove(word)
            else:
                raise Exception(word + " not in list")

    def print_stopwords(self):
        """
        This function prints all the stopwords
        that are present in the list
        Return Type - list
        ------------------------------
        Example
        obj = NLP()
        obj.print_stopwords()
        """
        if not self.stopword_list:
            raise Exception("Error - stopword list is empty")
        print(self.stopword_list)

    def process(self, doc):
        """
        This function processes the doc
        If the remove_stopwords flag is True
            - it will remove stopwords from doc
        If the replace_words flag is True
            - it will clean the doc by replacing words
        Parameters - [doc]
        ------------------------------
        Example
        obj = NLP()
        obj.process(["process this text"])

        How to use with pandas?
        obj = NLP()
        df['text'] = df['text'].apply(obj.process)
        """
        self.doc = doc
        if self.replace_words is True:
            self.replace_words_fun()
        if self.remove_html_tags is True:
            self.remove_html_tags_fun()
        if self.remove_stopwords is True:
            self.remove_stopwords_fun()
        if self.remove_numbers is True:
            self.remove_numbers_fun()
        if self.remove_punctations is True:
            self.remove_punctations_fun() 
        if self.lemmatize is True:
            self.lemmatize_fun()
        return self.doc
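A minimal sketch of the processing pipeline, assuming the module-level stopwords collection and to_replace mapping that this class relies on are importable:

obj = NLP(lemmatize=True)
print(obj.process("The <b>3 cats</b> were running quickly!"))
# roughly "cat running quickly": tags, digits, stopwords and punctuation are
# stripped and the remaining tokens are lemmatized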
Example 5
class PreProcessor():
    def __init__(self,
                 file_path=None,
                 doc_link=None,
                 folder_link=None,
                 remove_stopwords=True,
                 lower=True,
                 tokenize_word=True,
                 contraction_method='mapping',
                 remove_numbers=True,
                 remove_html_tags=True,
                 remove_punctuations=True,
                 remove_accented_chars=True,
                 remove_whitespace=True,
                 lemmatize_method='wordnet',
                 embedding_method='word2vec',
                 auto_correct=True):
        """
        This package contains functions that can help during the
        preprocessing of text data.
        :param remove_stopwords: boolean
            default value = True
        :param contraction_method: str
            default value = 'mapping'
        """
        if (type(remove_stopwords) != bool or type(lower) != bool
                or type(tokenize_word) != bool or
                # type(tokenize_sent) != bool or
                type(remove_numbers) != bool or type(remove_html_tags) != bool
                or type(remove_punctuations) != bool
                or type(remove_accented_chars) != bool
                or type(auto_correct) != bool
                or type(remove_whitespace) != bool):
            raise Exception("Error - expecting a boolean parameter")
        if lemmatize_method not in ['wordnet', 'snowball']:
            raise Exception("Error - lemmatizer method not supported")
        else:
            self.lemmatize = True
        if contraction_method not in ['glove', 'word2vec', 'mapping']:
            raise Exception("Error - contraction method not supported")
        else:
            self.contractions = True
        if embedding_method not in ['glove', 'word2vec', 'bow']:
            raise Exception("Error - embedding method not supported")
        else:
            self.word_embedding = True
        if file_path == None and doc_link == None and folder_link == None:
            raise Exception("Error - expecting the file path")
        self.doc = None
        self.sents = None
        self.tweets = None
        self.lemmatizer = None
        self.file_path = file_path
        self.doc_link = doc_link
        self.folder_link = folder_link
        self.lower = lower
        self.remove_stopwords = remove_stopwords
        self.contraction_method = contraction_method
        self.embedding_method = embedding_method
        self.remove_numbers = remove_numbers
        self.remove_html_tags = remove_html_tags
        self.remove_punctations = remove_punctuations
        self.remove_accented_chars = remove_accented_chars
        self.remove_whitespace = remove_whitespace
        self.lemmatize_method = lemmatize_method
        self.stopword_list = stopwords.words('english')
        self.replacement_list = to_replace
        self.tokenize_word = tokenize_word
        # self.tokenize_sent = tokenize_sent
        self.auto_correct = auto_correct
        if self.lemmatize_method == 'wordnet':
            self.lemmatizer = WordNetLemmatizer()
        if self.lemmatize_method == 'snowball':
            self.lemmatizer = SnowballStemmer('english')

    def file_reader(self):
        file_content = prepare_text(self.file_path, dolower=False)
        return file_content

    def doc_downloader(self, document_link, document_type, document_name):
        # Extracting the ID from the given link
        pattern = r"(?<=d/)(.+)(?=/)"
        DOCUMENT_ID = re.findall(pattern, document_link)[0]
        print(f"DOCUMENT ID: {DOCUMENT_ID}")

        # Specifying the format in which the document will be downloaded
        if document_type.lower() in ['docx', "doc"]:
            file_format = "docx"
        elif document_type.lower() in ['pdf']:
            file_format = "pdf"
        else:
            print(
                "Document Format Not Supported. Only Docs, Doc and PDF are supported"
            )
            return None

        creds = None

        if os.path.exists(token_file):
            with open(token_file, 'rb') as token:
                creds = pickle.load(token)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    credentials_json, SCOPES)
                creds = flow.run_local_server(port=0)
            with open(token_file, 'wb') as token:
                pickle.dump(creds, token)
        service = build('drive', 'v3', credentials=creds)

        file_name = '.'.join([document_name, file_format])
        try:
            print("Downloading file")
            request = service.files().get_media(fileId=DOCUMENT_ID)
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fd=fh, request=request)
            done = False
            while done is False:
                status, done = downloader.next_chunk()
                print(f"Download {status.progress()*100}")
        except Exception:
            print("Downloading MS Word Document file")
            request = service.files().export_media(
                fileId=DOCUMENT_ID,
                mimeType=
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
            )
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fd=fh, request=request)
            done = False
            while done is False:
                status, done = downloader.next_chunk()
                print(f"Download {status.progress()*100}")

        fh.seek(0)
        with open(os.path.join(file_storage, file_name), 'wb') as f:
            f.write(fh.read())
            f.close()
            print("SAVED")

    def folder_downloader(self, folder_link):
        # Extracting the ID from the given link
        pattern = r'(?<=folders/)(\w+)'
        DOCUMENT_ID = re.findall(pattern, folder_link)[0]
        print(f"DOCUMENT ID: {DOCUMENT_ID}")

        creds = None

        if os.path.exists(token_file):
            with open(token_file, 'rb') as token:
                creds = pickle.load(token)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    credentials_json, SCOPES)
                creds = flow.run_local_server(port=0)
            with open(token_file, 'wb') as token:
                pickle.dump(creds, token)
        service = build('drive', 'v3', credentials=creds)

        listofFiles = []
        page_token = None
        # docx_query = f"'{DOCUMENT_ID}' in parents and mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document'"
        # pdf_query = f"'{DOCUMENT_ID}' in parents and mimeType='application/pdf'"
        # txt_query = f"'{DOCUMENT_ID}' in parents and mimeType='text/plain'"
        query = f"'{DOCUMENT_ID}' in parents"
        while True:
            response = service.files().list(
                q=query,
                fields='nextPageToken, files(id, name)',
                pageToken=page_token,
                includeItemsFromAllDrives=True,
                supportsAllDrives=True).execute()
            for file in response.get('files', []):
                listofFiles.append(file)

            page_token = response.get('nextPageToken', None)
            if page_token is None:
                break

        for item in listofFiles:
            document_id = item['id']
            file_name = item['name']
            name_splitted = file_name.split(".")
            if len(name_splitted) == 1:
                file_name = '.'.join([file_name, "docx"])
            try:
                print("Downloading docx file")
                print(file_name)
                request = service.files().get_media(fileId=document_id)
                fh = io.BytesIO()
                downloader = MediaIoBaseDownload(fd=fh, request=request)
                done = False
                while done is False:
                    status, done = downloader.next_chunk()
                    print(f"Download {status.progress()*100}")
            except Exception:
                print("Downloading doc file")
                print(file_name)
                request = service.files().export_media(
                    fileId=document_id,
                    mimeType=
                    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                )
                fh = io.BytesIO()
                downloader = MediaIoBaseDownload(fd=fh, request=request)
                done = False
                while done is False:
                    status, done = downloader.next_chunk()
                    print(f"Download {status.progress()*100}")
            fh.seek(0)
            with open(file_storage + '/' + file_name, 'wb') as f:
                f.write(fh.read())
                f.close()

    def lower_fun(self):
        """
        This function converts text to lower
        """
        self.doc = self.doc.lower()

    def remove_stopwords_fun(self):
        """
        This function removes stopwords from doc.
        It works by tokenizing the doc and then
        checking if the word is present in stopwords
        """
        # tokens = str(self.doc).split()
        tokens = word_tokenize(self.doc)
        cleaned_tokens = [
            token for token in tokens
            if token.lower() not in self.stopword_list
        ]

        self.doc = ' '.join(cleaned_tokens)

    def word_embedding_fun(self):
        # if(self.tokenize_sent==False):
        #     self.doc = sent_tokenize(self.doc)
        if not self.tokenize_word:
            self.tokenize_word_fun()
        if self.embedding_method == 'glove':
            model = api.load("glove-twitter-25")
            vecs = []
            for x in self.doc:
                vec = [model[i] for i in x]
                vecs.append(vec)
            # assign once, after every item has been embedded
            self.doc = vecs
        elif self.embedding_method == 'word2vec':
            pass
        elif self.embedding_method == 'bow':
            pass

    def mapping_decontraction(self, phrase):
        # operate on the phrase argument rather than re-reading self.doc
        cleaned_doc = []
        for word in str(phrase).split():
            if word.lower() in self.replacement_list.keys():
                cleaned_doc.append(self.replacement_list[word.lower()])
            else:
                cleaned_doc.append(word)
        return ' '.join(cleaned_doc)

    def contractions_fun(self):
        """
        This function expands contractions
        by checking whether a word is present in a dictionary;
        if the word is present in the dictionary, it is replaced
        with its value from the dictionary
        """
        if self.contraction_method == 'mapping':
            self.doc = self.mapping_decontraction(str(self.doc))
        elif self.contraction_method == 'word2vec':
            model = pretrained_model
            cont = Contractions(model)
            cont.load_models()
            self.doc = list(cont.expand_texts([str(self.doc)],
                                              precise=True))[0]
        elif self.contraction_method == 'glove':
            model = api.load("glove-twitter-25")
            cont = Contractions(kv_model=model)
            cont.load_models()
            self.doc = list(cont.expand_texts([str(self.doc)],
                                              precise=True))[0]

    def remove_numbers_fun(self):
        """
        This function uses regex to remove
        all the numbers from the doc.
        """
        self.doc = re.sub("[0-9]", "", self.doc)
        self.doc = self.doc.strip()
        self.doc = " ".join(self.doc.split())

    def autocorrect_fun(self):
        spell = Speller(lang='en')
        self.doc = [spell(w) for w in word_tokenize(self.doc)]

    def remove_html_tags_fun(self):
        """
        This function uses regex's compile method
        to remove all the HTML tags from the doc
        """
        cleaner = re.compile('<.*?>')
        cleaned_text = re.sub(cleaner, '', self.doc)
        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
        self.doc = cleaned_text.strip()
        self.doc = " ".join(self.doc.split())

    def remove_punctations_fun(self):
        """
        This function uses regex to remove all the
        punctuation from the doc.
        """
        self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)
        self.doc = self.doc.strip()
        self.doc = " ".join(self.doc.split())

    def remove_accented_chars_fun(self):
        """remove accented characters from text, e.g. café"""
        self.doc = unidecode.unidecode(self.doc)

    def remove_whitespace_fun(self):
        """remove extra whitespaces from text"""
        text = self.doc.strip()
        self.doc = " ".join(text.split())

    def tokenize_word_fun(self):
        """tokenizes the sentences to words"""
        self.doc = word_tokenize(self.doc)

    # def tokenize_sent_fun(self):
    #     """tokenizes the paragraphs to sentences"""
    #     self.sents = sent_tokenize(self.doc)

    def lemmatize_fun(self):
        """
        This function applies lemmatization or stemming to the words.
        It can be operated with either WordNetLemmatizer
        or Snowball Stemmer
        ---------------------------
        Example:
        PreProcessor(..., lemmatize_method='snowball')

        default value = 'wordnet'
        """
        # self.doc may still be a plain string here (e.g. when auto_correct
        # is disabled), so make sure we iterate over tokens, not characters
        tokens = self.doc if isinstance(self.doc, list) else word_tokenize(self.doc)
        cleaned_tokens = None
        if self.lemmatize_method == 'wordnet':
            cleaned_tokens = [
                self.lemmatizer.lemmatize(token) for token in tokens
            ]
        elif self.lemmatize_method == 'snowball':
            cleaned_tokens = [
                self.lemmatizer.stem(token) for token in tokens
            ]

        self.doc = ' '.join(cleaned_tokens)

    def add_stopword(self, words):
        """
        This function is used to add new stopwords
        to the predefined list
        Parameters - ["new_stopword"]
        ------------------------------
        Example -
        obj.add_stopword(["first_word", "second_word"])
        """
        if self.remove_stopwords is False:
            raise Exception("Please enable removal of stopwords")
        if type(words) != list:
            raise Exception("Error - pass stopwords in list")
        for word in words:
            # stopword_list is a plain list here, so append instead of add
            self.stopword_list.append(word)

    def print_stopwords(self):
        """
        This function prints all the stopwords
        that are present in the list
        Return Type - list
        ------------------------------
        Example
        obj = NLP()
        obj.print_stopwords()
        """
        if self.stopword_list == []:
            raise Exception("Error - stopword list is empty")
        print(self.stopword_list)

    def process(self):
        """
        This function processes the doc
        If the remove_stopwords flag is True
            - it will remove stopwords from doc
        If the clean_words flag is True
            - it will clean the doc by replacing words
        Parameters - [doc]
        ------------------------------
        Example
        obj = NLP()
        obj.process(["process this text"])

        How to use with pandas?
        obj = NLP()
        df = df['text].apply(obj.process)
        """
        if self.file_path != None:
            data = self.file_reader()
        if self.doc_link != None:
            self.doc_downloader(self.doc_link, "docx", "testing_document")
            path = file_storage + '/testing_document.docx'
            data = prepare_text(path, dolower=False)
        if self.folder_link != None:
            self.folder_downloader(self.folder_link)
            data = 'test'
        output = []
        self.sents = sent_tokenize(data)
        for doc in self.sents:
            self.doc = doc
            if self.lower is True:
                self.lower_fun()
            if self.contractions is True:
                self.contractions_fun()
            if self.remove_html_tags is True:
                self.remove_html_tags_fun()
            if self.remove_numbers is True:
                self.remove_numbers_fun()
            if self.remove_punctations is True:
                self.remove_punctations_fun()
            if self.remove_accented_chars is True:
                self.remove_accented_chars_fun()
            if self.remove_stopwords is True:
                self.remove_stopwords_fun()
            if self.remove_whitespace is True:
                self.remove_whitespace_fun()
            if self.auto_correct is True:
                self.autocorrect_fun()
            if self.lemmatize is True:
                self.lemmatize_fun()
            if self.tokenize_word is True:
                self.tokenize_word_fun()
            if self.word_embedding is True:
                self.word_embedding_fun()
            output.append(self.doc)
        return output
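A usage sketch for the local-file path (the file name is hypothetical), assuming the project's prepare_text helper and the NLTK/gensim/autocorrect dependencies are available:

pre = PreProcessor(file_path="sample_document.docx",  # hypothetical local file
                   auto_correct=False,                # skip the slow spell checker
                   contraction_method='mapping',
                   embedding_method='bow')            # 'bow' branch above is a no-op
sentences = pre.process()  # one cleaned, tokenized list of words per sentence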
def stem_tokenize(text):
    # Despite the name, this tokenizer lemmatizes with WordNet; it does not stem
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
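A quick check of the tokenizer (assuming the NLTK punkt and wordnet corpora have been downloaded):

print(stem_tokenize("The cats were running"))  # ['The', 'cat', 'were', 'running']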
Example 7
class Lemmatizer(PreprocessingStep):
    COLUMN_LEMMA = "lemma"

    class LemmatizerType(Enum):
        english_nltk = 1
        english_spacy = 2
        italian_nltk = 3  # is a stemmer, nltk does not have an italian lemmatizer
        italian_spacy = 4

    def __init__(self, lemmatizer_type=LemmatizerType.english_nltk):
        self._lemmatizer_type = lemmatizer_type
        self._lemmatizer = None
        if lemmatizer_type == self.LemmatizerType.english_nltk:
            self._lemmatizer = WordNetLemmatizer()
        elif lemmatizer_type == self.LemmatizerType.english_spacy:
            # Use precalculated files for spacy since google colab can't handle fasttext model and spacy lemmatizer at once
            self._lemmatizer = FileUtil.read_csv_to_dataframe(
                Paths.PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV)
        elif lemmatizer_type == self.LemmatizerType.italian_nltk:
            self._lemmatizer = SnowballStemmer("italian")
        elif lemmatizer_type == self.LemmatizerType.italian_spacy:
            # Use precalculated files for spacy since google colab can't handle fasttext model and spacy lemmatizer at once
            self._lemmatizer = FileUtil.read_csv_to_dataframe(
                Paths.PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
        else:
            log.error(f"Unknown case for LemmatizerType: {lemmatizer_type}")

    def execute(self, text_tokens, file_name, javadoc):
        if self._lemmatizer_type == self.LemmatizerType.english_nltk:
            return [self._lemmatizer.lemmatize(token) for token in text_tokens]
        elif self._lemmatizer_type == self.LemmatizerType.english_spacy or self._lemmatizer_type == self.LemmatizerType.italian_spacy:
            return [
                self._lemmatizer.at[token, self.COLUMN_LEMMA]
                if token in self._lemmatizer.index else token
                for token in text_tokens
            ]
        elif self._lemmatizer_type == self.LemmatizerType.italian_nltk:
            return [self._lemmatizer.stem(token) for token in text_tokens]

    @classmethod
    def _precalculate_spacy_lemmatizer(cls, spacy_lemmatizer, dataset_tuple,
                                       output_path):
        word_to_lemma_map = {}

        def iterate_files(tokenizer, preprocessor, folder):
            for file in FileUtil.get_files_in_directory(folder, True):
                file_representation = tokenizer.tokenize(file)
                file_representation.preprocess(preprocessor)
                for word in file_representation.token_list:
                    lemma = [token.lemma_ for token in spacy_lemmatizer(word)]
                    if len(lemma) > 1:
                        log.info(
                            f"More than one lemma {lemma} for \"{word}\". Using \"{''.join(lemma)}\" as lemma"
                        )
                    lemma = "".join(lemma)
                    if word in word_to_lemma_map:
                        if not word_to_lemma_map[word] == lemma:
                            log.info(
                                f"Different duplicate lemma for {word}: {word_to_lemma_map[word]} <-> {lemma}"
                            )
                    else:
                        word_to_lemma_map[word] = lemma

        for dataset, code_pre, code_tok, req_pre, req_tok in dataset_tuple:

            iterate_files(req_tok, req_pre, dataset.req_folder())
            iterate_files(code_tok, code_pre, dataset.code_folder())

        word_to_lemma_dataframe = pandas.DataFrame.from_dict(
            word_to_lemma_map, orient="index", columns=[cls.COLUMN_LEMMA])
        FileUtil.write_dataframe_to_csv(word_to_lemma_dataframe, output_path)

    @classmethod
    def precalculate_spacy_english_lemmatizer(cls, dataset_tuple):
        cls._precalculate_spacy_lemmatizer(
            en_core_web_trf.load(disable=['ner', 'parser']), dataset_tuple,
            Paths.PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV
        )  # we only need the lemmatizer component, disable the others

    @classmethod
    def precalculate_spacy_italian_lemmatizer(cls, dataset_tuple):
        cls._precalculate_spacy_lemmatizer(
            it_core_news_lg.load(disable=['ner', 'parser']), dataset_tuple,
            Paths.PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
class NLP():
    def __init__(self,
                 remove_stopwords=True,
                 replace_words=True,
                 remove_numbers=True,
                 remove_html_tags=True,
                 remove_punctuations=True,
                 lemmatize=False,
                 lemmatize_method='wordnet'):
        """
        This package contains functions that can help during the
        preprocessing of text data.
        :param remove_stopwords: boolean
            default value = True
        :param replace_words: boolean
            default value = True
        """
        if (type(remove_stopwords) != bool or type(replace_words) != bool
                or type(remove_numbers) != bool
                or type(remove_html_tags) != bool
                or type(remove_punctuations) != bool
                or type(lemmatize) != bool):
            raise Exception("Error - expecting a boolean parameter")
        if lemmatize_method not in ['wordnet', 'snowball']:
            raise Exception("Error - lemmatizer method not supported")
        self.doc = None
        self.tweets = None
        self.lemmatizer = None
        self.remove_stopwords = remove_stopwords
        self.replace_words = replace_words
        self.remove_numbers = remove_numbers
        self.remove_html_tags = remove_html_tags
        self.remove_punctations = remove_punctuations
        self.lemmatize_method = lemmatize_method
        self.lemmatize = lemmatize
        self.stopword_list = set(stopwords)
        self.replacement_list = to_replace
        if self.lemmatize_method == 'wordnet':
            self.lemmatizer = WordNetLemmatizer()
        if self.lemmatize_method == 'snowball':
            self.lemmatizer = SnowballStemmer('english')

    def remove_stopwords_fun(self):
        """
        This function removes stopwords from doc.
        It works by tokenizing the doc and then
        checking if the word is present in stopwords
        """
        tokens = str(self.doc).split()
        cleaned_tokens = [
            token for token in tokens
            if token.lower() not in self.stopword_list
        ]
        self.doc = ' '.join(cleaned_tokens)

    def replace_words_fun(self):
        """
        This function replaces words (e.g. contractions)
        by checking whether a word is present in a dictionary;
        if the word is present in the dictionary, it is replaced
        with its value from the dictionary
        """

        cleaned_doc = []
        for word in str(self.doc).split():
            if word.lower() in self.replacement_list.keys():
                cleaned_doc.append(self.replacement_list[word.lower()])
            else:
                cleaned_doc.append(word)
        self.doc = ' '.join(cleaned_doc)

    def remove_numbers_fun(self):
        """
        This function uses regex to remove
        all the numbers from the doc.
        """
        self.doc = re.sub("[0-9]", "", self.doc)

    def remove_html_tags_fun(self):
        """
        This function uses regex's compile method
        to remove all the HTML tags from the doc
        """
        cleaner = re.compile('<.*?>')
        cleaned_text = re.sub(cleaner, '', self.doc)
        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
        self.doc = cleaned_text

    def remove_punctations_fun(self):
        """
        This function uses regex to remove all the
        punctuation from the doc.
        """
        self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)

    def lemmatize_fun(self):
        """
        This function applies lemmatization or stemming to the words.
        It can be operated with either WordNetLemmatizer
        or Snowball Stemmer
        ---------------------------
        Example:
        NLP(lemmatize=True, lemmatize_method='snowball')

        default value = 'wordnet'
        """
        tokens = str(self.doc).split()
        cleaned_tokens = None
        if self.lemmatize_method == 'wordnet':
            cleaned_tokens = [
                self.lemmatizer.lemmatize(token) for token in tokens
            ]
        else:
            cleaned_tokens = [self.lemmatizer.stem(token) for token in tokens]

        self.doc = ' '.join(cleaned_tokens)

    def add_stopword(self, words):
        """
        This function is used to add new stopwords
        to the predefined list
        Parameters - ["new_stopword"]
        ------------------------------
        Example -
        obj = NLP()
        obj.add_stopword(["first_word", "second_word"])
        """
        if self.remove_stopwords is False:
            raise Exception("Please enable removal of stopwords")
        if type(words) != list:
            raise Exception("Error - pass stopwords in list")
        for word in words:
            self.stopword_list.add(word)

    def add_replacement(self, replacements):
        """
        This function is used to add new replacement words
        to the predefined list
        Parameters - {"word": "replacement"}
        ----------------------------
        Example -
        obj = NLP()
        obj.add_replacement({"first": "replacement1", "second": "replacement2"})
        """
        if self.replace_words is False:
            raise Exception("Please enable replacement of words")
        if type(replacements) != dict:
            raise Exception("Error - pass input parameters in dict")
        if replacements == {}:
            raise Exception("Error - dict is empty")
        for key, value in replacements.items():
            self.replacement_list[key] = value

    def remove_stopword(self, words):
        """
        This function is used to remove stopwords from the predefined list.
        (Named remove_stopword, singular, so it is not shadowed by the
        remove_stopwords boolean attribute set in __init__.)
        Parameters - ["first_word"]
        ------------------------------
        Example
        obj = NLP()
        obj.remove_stopword(['new_stopword_here'])

        """
        if self.remove_stopwords is False:
            raise Exception("Error - enable stopword removal functionality")
        if type(words) != list:
            raise Exception("Error - expected a list")
        if words == []:
            raise Exception("Error - no items to remove from stopword list")
        for word in words:
            if word in self.stopword_list:
                self.stopword_list.remove(word)
            else:
                raise Exception(word + " not in list")

    def print_stopwords(self):
        """
        This function prints all the stopwords
        that are present in the list
        Return Type - list
        ------------------------------
        Example
        obj = NLP()
        obj.print_stopwords()
        """
        if not self.stopword_list:
            raise Exception("Error - stopword list is empty")
        print(self.stopword_list)

    def process(self, doc):
        """
        This function processes the doc
        If the remove_stopwords flag is True
            - it will remove stopwords from doc
        If the replace_words flag is True
            - it will clean the doc by replacing words
        Parameters - [doc]
        ------------------------------
        Example
        obj = NLP()
        obj.process(["process this text"])

        How to use with pandas?
        obj = NLP()
        df['text'] = df['text'].apply(obj.process)
        """
        self.doc = doc
        if self.replace_words is True:
            self.replace_words_fun()
        if self.remove_html_tags is True:
            self.remove_html_tags_fun()
        if self.remove_stopwords is True:
            self.remove_stopwords_fun()
        if self.remove_numbers is True:
            self.remove_numbers_fun()
        if self.remove_punctations is True:
            self.remove_punctations_fun()
        if self.lemmatize is True:
            self.lemmatize_fun()
        return self.doc

    def processTweet(self, tweets):
        """
        Expects tweets to be a pandas series
        Example use-case:
              tweets = processTweet(tweets)
        ______________________________________
        • Lower-casing
        • Normalizing URLs
        • Normalizing Tags and email addresses
        • Normalizing Numbers
        • Normalizing Dollars
        • Normalize punctuation
        • Expansion of contractions
        • Removal of punctuation
        • Word Stemming (Porter Stemmer)
        """
        self.tweets = tweets
        # Lower case text
        tweets = tweets.str.lower()

        # Account Tag @theFakeDonaldTrump
        tweets = tweets.str.replace(r"@[^\s]+", 'idaddr', regex=True)

        # Email address
        tweets = tweets.str.replace(r"[^\s]+@[^\s]+", 'emailaddr', regex=True)

        # Handle URLS
        # Look for strings starting with http:// or https://
        tweets = tweets.str.replace(r"(http|https)://[^\s]*", 'httpaddr', regex=True)

        # Handle Numbers
        # Look for one or more characters between 0-9
        tweets = tweets.str.replace(r"[0-9]+", 'number', regex=True)

        # Handle $ sign
        tweets = tweets.str.replace(r"[$]+", 'dollar', regex=True)

        # Normalize punctuation
        transl_table = dict([(ord(x), ord(y))
                             for x, y in zip(u"‘’´“”–-", u"'''\"\"--")])
        tweets = tweets.apply(lambda a: a.translate(transl_table))

        # Expand Contractions
        tweets = tweets.apply(lambda string: " ".join([
            to_replace[i] if i in to_replace.keys() else i
            for i in string.split()
        ]))

        # Handle punctuation
        tweets = tweets.str.replace(r"[^\w]+", ' ', regex=True)

        # Stem
        stemmer = nltk.stem.PorterStemmer()
        tweets = tweets.apply(lambda a: list(map(stemmer.stem, a.split())))

        return tweets
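A minimal sketch for processTweet, assuming pandas and nltk are imported and the module-level stopwords and to_replace objects used by this class are available:

import pandas as pd

nlp = NLP()
tweets = pd.Series(["@someone Don't miss this, it's 100% FREE: https://example.com !!!"])
print(nlp.processTweet(tweets)[0])
# a list of lower-cased, normalized, stemmed tokens, e.g.
# ['idaddr', 'do', 'not', 'miss', 'thi', 'it', 'is', 'number', 'free', 'httpaddr']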