Esempio n. 1
0
    def __init__(self,
                 docs: list,
                 labels: list,
                 stopword_language: str = None,
                 clean_up_rex: str = None,
                 remove_stopwords: bool = True):
        """
        This constructor sets the initial values for the Doc2VecHandler.

        The documents and labels are only taken over when both have content
        and both lists have the same length; otherwise both stay None.
            :param docs:list: list of documents
            :param labels:list: list of labels for the documents
            :param stopword_language:str: language for the stopword corpus, "german" if None
            :param clean_up_rex:str: text cleanup regex pattern, a default pattern is used if None
            :param remove_stopwords:bool: flag that is stored on the handler as given
        """
        # Define both attributes up front so they always exist, even when the
        # validation below rejects the input or an exception is caught.
        self._docs = None
        self._labels = None

        try:
            self._remove_stopwords = remove_stopwords
            # The default pattern matches every character that is not an ASCII
            # letter, a German umlaut, a digit, "^", "'" or "-".
            self._cleanup_rex = clean_up_rex if isNotNone(
                clean_up_rex) else r"[^A-Za-züÜäÄöÖ0-9^'-]"
            self._stopword_language = stopword_language if isNotNone(
                stopword_language) else "german"

            # Documents and labels must pair up one-to-one.
            if hasContent(docs) and hasContent(labels) and (len(docs)
                                                            == len(labels)):
                self._docs = docs
                self._labels = labels

        except Exception as ex:
            template = "An exception of type {0} occurred in [Doc2VecHandler.Constructor]. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)
Esempio n. 2
0
    def SentencesForWordsAPICollector(self, words: list, min_count: int = 10):
        """
        Collect example sentences for each given word through the API caller
        and store them as one raw dataset text file per word.

        Words whose result list is shorter than min_count are not stored; their
        result count is printed instead. Any exception is printed and ends the
        program with exit code 1.
            :param words:list: list of words where sentences have to be collected for
            :param min_count:int: minimal amount of sentences a word needs to be stored
        """
        try:
            raw_folder_missing = not os.path.exists(self.DATASET_RAW_PATH)
            if raw_folder_missing:
                os.mkdir(self.DATASET_RAW_PATH)

            for word in words:
                api_caller = UniLeipzigAPICaller(word, self.COLLECTING_LIMIT,
                                                 self.COLLECT_API_BASE_URL,
                                                 self.COLLECT_CORPUS,
                                                 self.COLLECT_TASK)
                found_sentences = api_caller.GetFoundSentences()

                # Nothing came back for this word: skip it silently.
                if not isNotNone(found_sentences):
                    continue

                # Too few results: report the count and skip the word.
                if len(found_sentences) < min_count:
                    print("Word [", word, "] had ",
                          len(found_sentences), "results!")
                    continue

                target_file = self.DATASET_RAW_PATH + word + "." + self.DATASET_SINGLE_FILE_TYP
                # Create (or truncate) the file before the Writer fills it.
                open(target_file, "w+").close()
                Writer(input_path=target_file,
                       in_elements=found_sentences,
                       in_context=None)
        except Exception as error:
            report = "An exception of type {0} occurred in [Main.SentencesForWordsAPICollector]. Arguments:\n{1!r}"
            print(report.format(type(error).__name__, error.args))
            sys.exit(1)
Esempio n. 3
0
 def GetCategories(self):
     """
     Return the stored categories of this word.

     The categories are only handed out when the word and the categories are
     set and CheckAnyListElementSameType accepts the list for the Category
     type. Otherwise, and after a caught exception, None is returned.
     """
     try:
         if not isNotNone(self._word):
             return None
         if not isNotNone(self._categories):
             return None
         if not CheckAnyListElementSameType(self._categories, Category):
             return None
         return self._categories
     except Exception as error:
         report = "An exception of type {0} occurred in [Word.GetCategories]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
Esempio n. 4
0
 def __init__(self, word:str, category:str, sentence:str):
     """
     This class is a model for a single sample containing a word, a category and an example sentence for it.
         :param word:str: word
         :param category:str: word category
         :param sentence:str: example sentence for the categorized word
     """
     try:
         self._word = word if isNotNone(word) else None
         self._category = category if isNotNone(category) else None
         self._sentence = sentence if isNotNone(sentence) else None
     except Exception as ex:
         template = "An exception of type {0} occurred in [Sample.Constructor]. Arguments:\n{1!r}"
         message = template.format(type(ex).__name__, ex.args)
         # Report the failure instead of building the message and dropping it.
         print(message)
 def __init__(self, file_name: str, json_name: str):
     """
     Constructor of the json builder; calls InitJson when the json file does not exist yet.
         :param file_name:str: file name, "dataset.json" if None
         :param json_name:str: json name, "dataset" if None
     """
     try:
         if isNotNone(file_name):
             self._file_name = file_name
         else:
             self._file_name = "dataset.json"

         if isNotNone(json_name):
             self._json_name = json_name
         else:
             self._json_name = "dataset"

         json_file_missing = not os.path.exists(self._file_name)
         if json_file_missing:
             self.InitJson()
     except Exception as error:
         report = "An exception of type {0} occurred in [JsonBuilder.Constructor]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
 def __init__(self, *args):
     """
     Store all given positional arguments as the content of this object.
         :param *args: arbitrary positional values, kept as a tuple
     """
     try:
         if isNotNone(args):
             self._content = args
         else:
             self._content = None
     except Exception as error:
         report = "An exception of type {0} occurred in [Shuffler.Constructor]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
 def UrlBuilder(self):
     """
     Assemble the search url from base url, corpus, task, search word and
     result limit and store it on the instance.

     Nothing is stored when no search word is set.
     """
     try:
         if not isNotNone(self._search_word):
             return

         # <base><corpus>/<task>/<word>
         resource = self._base_url + self._corpus + "/" + self._task + "/" + self._search_word
         # ...followed by the limit parameter and its value.
         self._search_url = resource + self._search_url_param + str(self._search_limit)
     except Exception as error:
         report = "An exception of type {0} occurred in [UniLeipzigAPICaller.UrlBuilder]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
    def __init__(self,
                 input_path: str,
                 file_extender: str = None,
                 in_elements: list = None,
                 in_context: str = None):
        """
        Constructor of the Writer class; it stores the given input right away.

        The output path is the input path, extended by '.' and the file
        extender when a non-empty extender is given. List elements and
        string context are each stored when they are not None.
            :param input_path:str: path of the input file
            :param file_extender:str: extender to create output file from input path
            :param in_elements:list: list of elements to store
            :param in_context:str: optional string context to store
        """
        try:
            if isNotEmptyString(file_extender):
                self._out_path = input_path + '.' + file_extender
            else:
                self._out_path = input_path

            print("Store Results [", self._out_path, "]")

            # Without a target path there is nothing left to do.
            if not isNotNone(self._out_path):
                print("Given path for FileWriter was None!")
                return

            if isNotNone(in_elements):
                self._elements = in_elements
                self.StoreListElements()

            if isNotNone(in_context):
                self._context = in_context
                self.StoreStringContext()

            nothing_given = isNone(in_elements) and isNone(in_context)
            if nothing_given:
                print("No Input was given for the FileWriter!")

        except Exception as error:
            report = "An exception of type {0} occurred in [FileWriter.Constructor]. Arguments:\n{1!r}"
            print(report.format(type(error).__name__, error.args))
    def __init__(
            self,
            word: str,
            result_limit: int,
            base_url: str = "http://api.corpora.uni-leipzig.de/ws/sentences/",
            corpus: str = "deu_news_2012_1M",
            task: str = "sentences"):
        """
        Constructor of the ApiCaller; stores the request settings.

        Text settings that are None or empty fall back to their defaults
        (the search word falls back to None); a limit that is None or not an
        int falls back to 1. The search url itself starts out as None.
            :param word:str: desired word
            :param result_limit:int: limit of results
            :param base_url:str: base url of the api providing server
            :param corpus:str: the desired corpus
            :param task:str: the desired task
        """
        try:
            def text_or(candidate, fallback):
                # Keep the candidate only when it is set and not an empty string.
                usable = isNotNone(candidate) and isNotEmptyString(candidate)
                return candidate if usable else fallback

            self._search_word = text_or(word, None)

            limit_usable = isNotNone(result_limit) and isInt(result_limit)
            self._search_limit = result_limit if limit_usable else 1

            self._base_url = text_or(
                base_url, "http://api.corpora.uni-leipzig.de/ws/sentences/")
            self._corpus = text_or(corpus, "deu_news_2012_1M")
            self._task = text_or(task, "sentences")
            self._search_url_param = "?limit="

            self._search_url = None
        except Exception as error:
            report = "An exception of type {0} occurred in [UniLeipzigAPICaller.Constructor]. Arguments:\n{1!r}"
            print(report.format(type(error).__name__, error.args))
 def StoreStringContext(self):
     """
     Write the stored string context into the output file.

     The file is opened (and thereby truncated) in any case; the context is
     only written when it is set and a string.
     """
     try:
         with open(self._out_path, 'w+', encoding=self.ENCODING) as target:
             context_is_text = isNotNone(self._context) and isStr(self._context)
             if context_is_text:
                 target.write(self._context)
                 target.flush()
     except ValueError:
         print('WRONG INPUT FOR [FileWriter.StoreStringContext]')
     except Exception as error:
         report = "An exception of type {0} occurred in [FileWriter.StoreStringContext]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
Esempio n. 11
0
    def CategoriesReader(self):
        """
        This function reads the file at the stored path into a list of Category objects.

        A line containing both "[" and "]" starts a new category whose name is
        the line without the brackets. Every other non-empty line is collected
        as a sentence. A category is only emitted when it has a name and at
        least one collected sentence.
            :returns: list of Category objects, or None if an exception occurred
        """
        try:
            collected_sentences: list = []
            category_name: str = None
            categories: list = []

            # The file is only read, so open it read-only; iterating the
            # handle streams the lines instead of loading them all at once.
            with open(self._path, 'r', encoding=self.ENCODING) as fileIn:
                for line in fileIn:
                    line = line.replace("\n", "")

                    if "[" in line and "]" in line:
                        # A new header closes the category collected so far.
                        if isNotNone(category_name) and hasContent(
                                collected_sentences):
                            categories.append(
                                Category(category_name, collected_sentences))
                            collected_sentences = []

                        category_name = line.replace("[", "").replace("]", "")

                    elif len(line) > 0:
                        collected_sentences.append(line)

                # Flush the last category, which has no header following it.
                if isNotNone(category_name) and hasContent(
                        collected_sentences):
                    categories.append(
                        Category(category_name, collected_sentences))

            return categories
        except Exception as ex:
            template = "An exception of type {0} occurred in [FileReader.CategoriesReader]. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)
            return None
    def GetRequestJson(self):
        """
        This function builds the search url, requests it and returns the json response.
            :returns: the decoded json content when the request answered with
                      status 200 and a "count" above 0, otherwise None
        """
        try:
            self.UrlBuilder()
            if isNotNone(self._search_url):
                response = requests.get(self._search_url)

                # Compare the status by value; an identity check on an int
                # only works by accident of interpreter caching.
                if isNotNone(response) and response.status_code == 200:
                    json_content = json.loads(response.content)

                    if json_content["count"] > 0:
                        return json_content

            print("Request failed on [" + self._search_word + "]!")
            return None
        except Exception as ex:
            template = "An exception of type {0} occurred in [UniLeipzigAPICaller.GetRequestJson]. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)
            return None
    def PlotSummary(self, file_name: str = None, show_shapes: bool = True):
        """
        Plot the model graph into a png file via plot_model.
            :param file_name:str=None: prefix of the graph file name, "ann_" if None
            :param show_shapes:bool=True: passed through to plot_model
        """
        try:
            if isNotNone(file_name):
                graph_file = file_name + "_ModelGraph.png"
            else:
                graph_file = "ann__ModelGraph.png"

            plot_model(self._model, to_file=graph_file, show_shapes=show_shapes)
        except Exception as error:
            report = "An exception of type {0} occurred in [Model.PlotSummary]. Arguments:\n{1!r}"
            print(report.format(type(error).__name__, error.args))
 def StoreListElements(self):
     """
     Write the stored elements into the output file, one element per line.

     Elements that are None are skipped.
     """
     try:
         with open(self._out_path, 'w+', encoding=self.ENCODING) as target:
             for entry in self._elements:
                 if not isNotNone(entry):
                     continue
                 target.write(entry + "\n")
                 target.flush()
     except ValueError:
         print('WRONG INPUT FOR [FileWriter.StoreListElements]')
     except Exception as error:
         report = "An exception of type {0} occurred in [FileWriter.StoreListElements]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
 def GetFoundSentences(self):
     """
     This function returns the sentences from the get response.
         :returns: list with the 'sentence' value of every entry in the
                   response's 'sentences', or None when the task is not
                   "sentences", the request gave no json or an exception occurred
     """
     try:
         # Compare the task by value; an identity check on a string only
         # succeeds when both strings happen to be the same interned object.
         if self._task == "sentences":
             response_json = self.GetRequestJson()
             if isNotNone(response_json):
                 return [
                     sentence_obj['sentence']
                     for sentence_obj in response_json['sentences']
                 ]
         return None
     except Exception as ex:
         template = "An exception of type {0} occurred in [UniLeipzigAPICaller.GetFoundSentences]. Arguments:\n{1!r}"
         message = template.format(type(ex).__name__, ex.args)
         print(message)
         return None
Esempio n. 16
0
 def CleanSentences(self, sentence: str = None):
     """
     Return a cleaned version of the given sentence.

     Everything matched by the cleanup pattern becomes a blank, the text is
     lowercased and split on whitespace, stopwords of the configured language
     are dropped and the remaining words are joined with single blanks.
         :param sentence:str: the sentence to clean
     """
     try:
         if not isNotNone(sentence):
             print("Empty sentences expelled!")
             return None

         scrubbed = re.sub(self._cleanup_rex, " ", sentence)
         tokens = scrubbed.lower().split()
         stop_set = set(stopwords.words(self._stopword_language))
         return " ".join(token for token in tokens if token not in stop_set)
     except Exception as error:
         report = "An exception of type {0} occurred in [Doc2VecHandler.CleanSentences]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
 def __init__(self,
              init_shape: tuple,
              model_folder: str,
              categories: int = -1):
     """
     Constructor; stores the given settings on the instance.
         :param init_shape:tuple: train data input shape
         :param model_folder:str: folder where the model files should be saved
         :param categories:int: amount of categories
     """
     try:
         self._init_shape: tuple = init_shape
         self._categories: int = categories

         if isNotNone(model_folder):
             folder = model_folder
         else:
             folder = None
         self._model_folder: str = folder
     except Exception as error:
         report = "An exception of type {0} occurred in [Model.Constructor]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
Esempio n. 18
0
 def GenerateDatasetSamples(self):
     """
     Build a flat list of Sample objects from the stored word models.

     One Sample is created per sentence of every category of every word.
     None is returned when CheckAnyListElementSameType rejects the word list
     for the Word type, and also after a caught exception.
     """
     try:
         if not CheckAnyListElementSameType(self._word_samples, Word):
             return None

         collected: list = []
         for word_model in self._word_samples:
             for category in word_model.GetCategories():
                 for sentence in category.GetSentences():
                     #TODO size of string and amount of minimum datasets please right here!
                     entry = Sample(word=word_model.GetName(),
                                    category=category.GetName(),
                                    sentence=sentence)
                     if isNotNone(entry):
                         collected.append(entry)
         return collected
     except Exception as error:
         report = "An exception of type {0} occurred in [SampleGenerator.GenerateDatasetSamples]. Arguments:\n{1!r}"
         print(report.format(type(error).__name__, error.args))
Esempio n. 19
0
    def CreateDatasetFolder(self, name: str = None):
        """
        This method creates the main dataset folder on None input or a dataset subfolder on given name.

        The outcome is reported on stdout with the full folder path.
            :param name:str: name of a dataset subfolder, appended to the main path
        """
        try:
            folderpath = (self._main_path + name) if isNotNone(
                name) else self._main_path
            self._foldername = None

            if not os.path.exists(folderpath):
                os.mkdir(folderpath)

            # Check the folder that was just handled; os.path.exists needs
            # the path argument and raises TypeError without it.
            if os.path.exists(folderpath):
                print("Successfully created the directory %s " % folderpath)
            else:
                print("Failed to create directory %s !" % folderpath)
        except Exception as ex:
            template = "An exception of type {0} occurred in [FolderManager.CreateDatasetFolder]. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)
    def PredictAndVisualize(self,
                            test_set,
                            categories: list,
                            decode_dict: dict,
                            test_words: list,
                            test_docs: list,
                            submission_file_name: str = "submission_all.csv",
                            isClasses: bool = True):
        """
        This method predicts results for a given test set and stores them in a csv file.

        The csv gets the columns 'Words', 'Docs', 'PredClasses' and
        'ClassNames', each only when its source value is not None.
            :param test_set: predictable input
            :param categories:list: replacement column names for the csv, kept as built if None
            :param decode_dict:dict: maps each prediction to its class name
            :param test_words:list: prediction words
            :param test_docs:list: prediction docs
            :param submission_file_name:str: name of the predicted results file
            :param isClasses:bool: predict classes, otherwise probabilities
        """
        try:
            mapped: dict = {}
            predictions = self._model.predict_classes(
                test_set) if isClasses else self._model.predict_proba(test_set)

            submission_file_name = submission_file_name if isNotNone(
                submission_file_name) else "submission_all.csv"
            if isNotNone(test_words): mapped['Words'] = test_words
            if isNotNone(test_docs): mapped['Docs'] = test_docs
            if isNotNone(predictions): mapped['PredClasses'] = predictions
            if isNotNone(decode_dict) and isNotNone(predictions):
                mapped['ClassNames'] = [decode_dict[p] for p in predictions]
            submission = pd.DataFrame(mapped)
            # Identity check: "!= None" compares element-wise on array-like
            # input, which makes the condition ambiguous.
            if categories is not None:
                submission.columns = categories
            submission.to_csv(submission_file_name, index=False)
        except Exception as ex:
            template = "An exception of type {0} occurred in [Model.PredictAndVisualize]. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)