def __init__(self, docs: list, labels: list, stopword_language: str = None, clean_up_rex: str = None, remove_stopwords: bool = True):
    """
    Store the cleanup configuration and the document/label lists on the Doc2VecHandler.
        :param docs:list: list of documents
        :param labels:list: list of labels for the documents
        :param stopword_language:str: language for the stopword corpus ("german" is used when isNotNone rejects the value)
        :param clean_up_rex:str: text cleanup regex pattern (a built-in pattern is used when isNotNone rejects the value)
        :param remove_stopwords:bool: flag kept on the instance as _remove_stopwords
    """
    try:
        self._remove_stopwords = remove_stopwords

        if isNotNone(clean_up_rex):
            self._cleanup_rex = clean_up_rex
        else:
            self._cleanup_rex = r"[^A-Za-züÜäÄöÖ0-9^'-]"

        if isNotNone(stopword_language):
            self._stopword_language = stopword_language
        else:
            self._stopword_language = "german"

        # Documents and labels are only accepted as a pair of equal length;
        # otherwise neither attribute is assigned here.
        inputs_match = hasContent(docs) and hasContent(labels) and (len(docs) == len(labels))
        if inputs_match:
            self._docs = docs
            self._labels = labels
    except Exception as ex:
        report = "An exception of type {0} occurred in [Doc2VecHandler.Constructor]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def SentencesForWordsAPICollector(self, words: list, min_count: int = 10):
    """
    Collect sentences for each word through UniLeipzigAPICaller and store them
    as one raw dataset text file per word inside DATASET_RAW_PATH.
        :param words:list: list of words where sentences have to be collected for
        :param min_count:int: minimal amount of sentences a word needs; words with fewer results are not stored
    """
    try:
        raw_folder_missing = not os.path.exists(self.DATASET_RAW_PATH)
        if raw_folder_missing:
            os.mkdir(self.DATASET_RAW_PATH)

        for word in words:
            api_caller = UniLeipzigAPICaller(
                word,
                self.COLLECTING_LIMIT,
                self.COLLECT_API_BASE_URL,
                self.COLLECT_CORPUS,
                self.COLLECT_TASK)
            found_sentences = api_caller.GetFoundSentences()

            # No result object at all: nothing to store and nothing to report.
            if not isNotNone(found_sentences):
                continue

            # Too few sentences: report the count and leave the word out.
            if len(found_sentences) < min_count:
                print("Word [", word, "] had ", len(found_sentences), "results!")
                continue

            target_file = self.DATASET_RAW_PATH + word + "." + self.DATASET_SINGLE_FILE_TYP
            # Create (or truncate) the file before handing it to the Writer.
            open(target_file, "w+").close()
            Writer(input_path=target_file, in_elements=found_sentences, in_context=None)
    except Exception as ex:
        report = "An exception of type {0} occurred in [Main.SentencesForWordsAPICollector]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
        sys.exit(1)
def GetCategories(self):
    """
    Return the stored categories when the word is set and the categories pass
    the Category type check, otherwise None.
    """
    try:
        categories_valid = (isNotNone(self._word)
                            and isNotNone(self._categories)
                            and CheckAnyListElementSameType(self._categories, Category))
        if categories_valid:
            return self._categories
        return None
    except Exception as ex:
        report = "An exception of type {0} occurred in [Word.GetCategories]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def __init__(self, word:str, category:str, sentence:str):
    """
    Model for a single sample containing a word, a category and an example sentence for it.
        :param word:str: word
        :param category:str: word category
        :param sentence:str: example sentence for the categorized word
    """
    try:
        self._word = word if isNotNone(word) else None
        self._category = category if isNotNone(category) else None
        self._sentence = sentence if isNotNone(sentence) else None
    except Exception as ex:
        template = "An exception of type {0} occurred in [Sample.Constructor]. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        # Emit the message: building it without printing would hide the failure,
        # and the other handlers in this file all report theirs.
        print(message)
def __init__(self, file_name: str, json_name: str):
    """
    Constructor of the json builder; it triggers InitJson when the json file does not exist yet.
        :param file_name:str: file name ("dataset.json" is used when isNotNone rejects the value)
        :param json_name:str: json name ("dataset" is used when isNotNone rejects the value)
    """
    try:
        if isNotNone(file_name):
            self._file_name = file_name
        else:
            self._file_name = "dataset.json"

        if isNotNone(json_name):
            self._json_name = json_name
        else:
            self._json_name = "dataset"

        json_file_missing = not os.path.exists(self._file_name)
        if json_file_missing:
            self.InitJson()
    except Exception as ex:
        report = "An exception of type {0} occurred in [JsonBuilder.Constructor]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def __init__(self, *args):
    """
    Collect the positional arguments and keep them on the instance as _content.
        :param *args: arbitrary positional values to store
    """
    try:
        if isNotNone(args):
            self._content = args
        else:
            self._content = None
    except Exception as ex:
        report = "An exception of type {0} occurred in [Shuffler.Constructor]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def UrlBuilder(self):
    """
    Assemble the request url from base url, corpus, task, search word and limit
    and store it in _search_url. Nothing is stored when the search word fails isNotNone.
    """
    try:
        if not isNotNone(self._search_word):
            return

        # Built left to right: <base><corpus>/<task>/<word><param><limit>
        url = self._base_url + self._corpus
        url = url + "/"
        url = url + self._task
        url = url + "/"
        url = url + self._search_word
        url = url + self._search_url_param
        url = url + str(self._search_limit)
        self._search_url = url
    except Exception as ex:
        report = "An exception of type {0} occurred in [UniLeipzigAPICaller.UrlBuilder]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def __init__(self, input_path: str, file_extender: str = None, in_elements: list = None, in_context: str = None):
    """
    Constructor of the Writer class; it resolves the output path and immediately
    stores the given elements and/or context.
        :param input_path:str: path of the input file
        :param file_extender:str: extender to create output file from input path
        :param in_elements:list: amr data pairs list like List<Array{sentence, semantic}>
        :param in_context:str: optional if no data pairs present use context
    """
    try:
        if isNotEmptyString(file_extender):
            self._out_path = input_path + '.' + file_extender
        else:
            self._out_path = input_path
        print("Store Results [", self._out_path, "]")

        # Without a usable path there is nothing to write to.
        if not isNotNone(self._out_path):
            print("Given path for FileWriter was None!")
            return

        if isNotNone(in_elements):
            self._elements = in_elements
            self.StoreListElements()

        # Runs after the list export, so a given context is written to the same path afterwards.
        if isNotNone(in_context):
            self._context = in_context
            self.StoreStringContext()

        nothing_given = isNone(in_elements) and isNone(in_context)
        if nothing_given:
            print("No Input was given for the FileWriter!")
    except Exception as ex:
        report = "An exception of type {0} occurred in [FileWriter.Constructor]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def __init__(
        self,
        word: str,
        result_limit: int,
        base_url: str = "http://api.corpora.uni-leipzig.de/ws/sentences/",
        corpus: str = "deu_news_2012_1M",
        task: str = "sentences"):
    """
    Constructor of the ApiCaller; every argument that fails its check is replaced by a fallback value.
        :param word:str: desired word (None is stored when the check fails)
        :param result_limit:int: limit of results (1 is stored when the check fails)
        :param base_url:str: base url of the api providing server
        :param corpus:str: the desired corpus
        :param task:str="sentences": the desired task
    """
    try:
        def _is_filled(value):
            # Same combined check the string arguments share.
            return isNotNone(value) and isNotEmptyString(value)

        if _is_filled(word):
            self._search_word = word
        else:
            self._search_word = None

        if isNotNone(result_limit) and isInt(result_limit):
            self._search_limit = result_limit
        else:
            self._search_limit = 1

        if _is_filled(base_url):
            self._base_url = base_url
        else:
            self._base_url = "http://api.corpora.uni-leipzig.de/ws/sentences/"

        if _is_filled(corpus):
            self._corpus = corpus
        else:
            self._corpus = "deu_news_2012_1M"

        if _is_filled(task):
            self._task = task
        else:
            self._task = "sentences"

        self._search_url_param = "?limit="
        # Filled later by UrlBuilder.
        self._search_url = None
    except Exception as ex:
        report = "An exception of type {0} occurred in [UniLeipzigAPICaller.Constructor]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def StoreStringContext(self):
    """
    Write the stored string context into the output file.
    The file is opened (and thereby truncated) even when the context is not written.
    """
    try:
        with open(self._out_path, 'w+', encoding=self.ENCODING) as out_file:
            context_usable = isNotNone(self._context) and isStr(self._context)
            if context_usable:
                out_file.write(self._context)
                out_file.flush()
    except ValueError:
        print('WRONG INPUT FOR [FileWriter.StoreStringContext]')
    except Exception as ex:
        report = "An exception of type {0} occurred in [FileWriter.StoreStringContext]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def CategoriesReader(self):
    """
    Read the file at _path and turn it into a list of Category objects.

    A line containing both "[" and "]" starts a new category whose name is the
    line with the brackets removed; every following non-empty line is collected
    as a sentence of that category. Categories without sentences are dropped.
        :return: list of Category objects, or None when an exception was caught
    """
    try:
        collected_sentences: list = []
        category_name: str = None
        categories: list = []

        # Read-only access is enough here; 'r+' would additionally demand write permission.
        with open(self._path, 'r', encoding=self.ENCODING) as fileIn:
            # Iterate the file lazily instead of loading all lines up front.
            for line in fileIn:
                line = line.replace("\n", "")

                if "[" in line and "]" in line:
                    # A new header closes the category collected so far.
                    if isNotNone(category_name) and hasContent(collected_sentences):
                        categories.append(Category(category_name, collected_sentences))

                    collected_sentences = []
                    category_name = line.replace("[", "").replace("]", "")
                elif len(line) > 0:
                    collected_sentences.append(line)

        # The last category has no following header, so it is flushed here.
        if isNotNone(category_name) and hasContent(collected_sentences):
            categories.append(Category(category_name, collected_sentences))

        return categories
    except Exception as ex:
        template = "An exception of type {0} occurred in [FileReader.CategoriesReader]. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)
def GetRequestJson(self):
    """
    Build the request url, call the API and return the parsed json response.
        :return: the parsed json content when the request answered with status 200
                 and its "count" entry is greater than 0; otherwise None
                 (also None when an exception was caught)
    """
    try:
        self.UrlBuilder()

        if isNotNone(self._search_url):
            # NOTE(review): no timeout is passed, so this call can block — confirm whether that is intended.
            response = requests.get(self._search_url)

            # Compare the status code by value; an identity check on an int is not a reliable equality test.
            if isNotNone(response) and response.status_code == 200:
                json_content = json.loads(response.content)
                if json_content["count"] > 0:
                    return json_content

        # format() keeps this message printable even when no search word is set.
        print("Request failed on [{0}]!".format(self._search_word))
        return None
    except Exception as ex:
        template = "An exception of type {0} occurred in [UniLeipzigAPICaller.GetRequestJson]. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)
def PlotSummary(self, file_name: str = None, show_shapes: bool = True):
    """
    Plot the model graph into a png file via plot_model.
        :param file_name:str=None: prefix of the output file; "_ModelGraph.png" is appended,
                                   "ann__ModelGraph.png" is used when isNotNone rejects the value
        :param show_shapes:bool=True: forwarded to plot_model
    """
    try:
        if isNotNone(file_name):
            graph_file = file_name + "_ModelGraph.png"
        else:
            graph_file = "ann__ModelGraph.png"

        plot_model(self._model, to_file=graph_file, show_shapes=show_shapes)
    except Exception as ex:
        report = "An exception of type {0} occurred in [Model.PlotSummary]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def StoreListElements(self):
    """
    Write every stored element that passes isNotNone into the output file, one element per line.
    """
    try:
        with open(self._out_path, 'w+', encoding=self.ENCODING) as out_file:
            for entry in self._elements:
                if not isNotNone(entry):
                    continue
                out_file.write(entry + "\n")
                out_file.flush()
    except ValueError:
        print('WRONG INPUT FOR [FileWriter.StoreListElements]')
    except Exception as ex:
        report = "An exception of type {0} occurred in [FileWriter.StoreListElements]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def GetFoundSentences(self):
    """
    Return the sentences contained in the API response.
        :return: list with the 'sentence' entry of every element in the response's
                 'sentences' list (empty when GetRequestJson delivered nothing);
                 None when the configured task is not "sentences" or an exception was caught
    """
    try:
        # Compare the task by value; an identity check on a string only matches
        # when both sides happen to be the very same object.
        if self._task != "sentences":
            return None

        # Named so that it does not shadow the json module.
        response_json = self.GetRequestJson()
        if not isNotNone(response_json):
            return []

        return [sentence_obj['sentence'] for sentence_obj in response_json['sentences']]
    except Exception as ex:
        template = "An exception of type {0} occurred in [UniLeipzigAPICaller.GetFoundSentences]. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)
def CleanSentences(self, sentence: str = None):
    """
    Return a cleaned version of the sentence: characters matching the cleanup
    pattern become spaces, the text is lowercased and stopwords of the
    configured language are dropped.
        :param sentence:str: the sentence to clean
    """
    try:
        if not isNotNone(sentence):
            print("Empty sentences expelled!")
            return None

        replaced = re.sub(self._cleanup_rex, " ", sentence)
        tokens = replaced.lower().split()
        stop_set = set(stopwords.words(self._stopword_language))

        kept_tokens = []
        for token in tokens:
            if token not in stop_set:
                kept_tokens.append(token)
        return " ".join(kept_tokens)
    except Exception as ex:
        report = "An exception of type {0} occurred in [Doc2VecHandler.CleanSentences]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def __init__(self, init_shape: tuple, model_folder: str, categories: int = -1):
    """
    Constructor storing the model configuration on the instance.
        :param init_shape:tuple: train data input shape
        :param model_folder:str: folder where the model stuff should be saved
        :param categories:int: amount of categories
    """
    try:
        self._init_shape: tuple = init_shape
        self._categories: int = categories

        if isNotNone(model_folder):
            self._model_folder: str = model_folder
        else:
            self._model_folder: str = None
    except Exception as ex:
        report = "An exception of type {0} occurred in [Model.Constructor]. Arguments:\n{1!r}"
        print(report.format(type(ex).__name__, ex.args))
def GenerateDatasetSamples(self):
    """
    Generate a flat list of Sample objects out of the word model list:
    one sample per (word, category, sentence) combination.
        :return: list of Sample objects; None when the word list fails the
                 Word type check or an exception was caught
    """
    try:
        if not CheckAnyListElementSameType(self._word_samples, Word):
            return None

        samples: list = []
        for word in self._word_samples:
            categories = word.GetCategories()
            # GetCategories may deliver None; skip that word instead of
            # letting the iteration fail and discard every sample.
            if not isNotNone(categories):
                continue

            for category in categories:
                sentences = category.GetSentences()
                # Same guard for the sentences — presumably GetSentences can also
                # deliver None; verify against the Category class.
                if not isNotNone(sentences):
                    continue

                for sentence in sentences:
                    #TODO size of string and amount of minimum datasets please right here!
                    samples.append(Sample(word=word.GetName(), category=category.GetName(), sentence=sentence))

        return samples
    except Exception as ex:
        template = "An exception of type {0} occurred in [SampleGenerator.GenerateDatasetSamples]. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)
def CreateDatasetFolder(self, name: str = None):
    """
    Create the initial dataset folder when no name is given, or the desired
    dataset subfolder (main path + name) otherwise. An existing folder is left untouched.
        :param name:str: name of a dataset subfolder
    """
    try:
        folderpath = self._main_path + name if isNotNone(name) else self._main_path
        self._foldername = None

        if not os.path.exists(folderpath):
            os.mkdir(folderpath)

            # Verify the creation against the actual path; os.path.exists needs
            # the path argument. The full path is reported because name may be None.
            if os.path.exists(folderpath):
                print("Successfully created the directory %s " % folderpath)
            else:
                print("Failed to create directory %s !" % folderpath)
    except Exception as ex:
        template = "An exception of type {0} occurred in [FolderManager.CreateDatasetFolder]. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)
def PredictAndVisualize(self, test_set, categories: list, decode_dict: dict, test_words: list, test_docs: list, submission_file_name: str = "submission_all.csv", isClasses: bool = True):
    """
    Predict results for a given test set and store them in a csv file.
        :param test_set: predictable input
        :param categories:list: column labels applied to the result table when given
        :param decode_dict:dict: class decoder dict used to look up a name per prediction
        :param test_words:list: prediction words
        :param test_docs:list: prediction docs
        :param submission_file_name:str: name of the predicted results file
        :param isClasses:bool: predict classes, otherwise probabilities
    """
    try:
        mapped: dict = {}
        predictions = self._model.predict_classes(test_set) if isClasses else self._model.predict_proba(test_set)
        submission_file_name = submission_file_name if isNotNone(submission_file_name) else "submission_all.csv"

        if isNotNone(test_words):
            mapped['Words'] = test_words

        if isNotNone(test_docs):
            mapped['Docs'] = test_docs

        if isNotNone(predictions):
            mapped['PredClasses'] = predictions

        # NOTE(review): the lookup uses each prediction as a dict key, which
        # presumably only fits class predictions — confirm for isClasses=False.
        if isNotNone(decode_dict) and isNotNone(predictions):
            mapped['ClassNames'] = [decode_dict[p] for p in predictions]

        submission = pd.DataFrame(mapped)

        # Identity check: "!= None" would be evaluated element-wise for array-like inputs.
        if categories is not None:
            submission.columns = categories

        submission.to_csv(submission_file_name, index=False)
    except Exception as ex:
        template = "An exception of type {0} occurred in [Model.PredictAndVisualize]. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)