import os

from nltk import stem

#EXCLUDE, STOP_WORDS, PREFIXES, DEBUG, Adeptner and Graph are expected to be
#imported from this project's configuration and sibling modules.


class Processor(object):
    def __init__(self):
        self.exclude = EXCLUDE
        self.stop_words = STOP_WORDS
        self.prefixes = PREFIXES  #currently prefixes are not used
        self.ner = Adeptner()

    def folderToGraph(self,
                      root="corpus",
                      target_folder=None,
                      graph=None,
                      ner=False,
                      lemmatize=True,
                      stemming='medium',
                      min_word_length=1):
        """
        Create a target folder and related subdirectories mirroring the
        structure of the root folder, then process all the text files in
        the root structure and store the results in the target structure.

        :root The root folder, containing year subfolders that in turn
            contain the '.txt' documents.
        :target_folder The folder where the processed files are stored.
        :graph The graph object.
        :ner Use named entity recognition.
        :stemming Whether to apply stemming. May be None, 'light',
            'medium' or 'heavy'.
        :min_word_length Minimum number of characters for a word to be kept.
        :return None
        """
        if not os.path.isdir(root):
            raise Exception("'{0}' folder doesn't exist".format(root))
        if graph:
            if not isinstance(graph, Graph):
                raise Exception("The graph object is not valid")
            graph.connect()
        if target_folder and not os.path.isdir(target_folder):
            os.mkdir(target_folder)  #create 'target' folder if it doesn't exist
        print("...preprocessing documents and building the graph")
        count_nodes = 0
        count_edges = 0
        for dir in os.listdir(root):  #visit year subdirectories of 'root'
            dir_origin = os.path.join(root, dir)
            #skip directories whose name is not a year
            if os.path.isdir(dir_origin) and dir.isdigit():
                year = int(dir)
                if target_folder:
                    dir_dest = os.path.join(target_folder, dir)
                    if not os.path.isdir(dir_dest):
                        #create year subfolders if they don't exist in 'target'
                        os.mkdir(dir_dest)
                #open text files under the year folder and process them
                for file in os.listdir(dir_origin):
                    if file.endswith(".txt"):
                        #put file into string
                        string = self.fileToString(os.path.join(dir_origin, file))
                        entities = self.processString(string, ner, lemmatize,
                                                      stemming, min_word_length)
                        if entities:
                            if DEBUG:
                                print(entities)
                            if graph:
                                cn, ce = graph.addToGraph(entities, year)
                                count_nodes += cn
                                count_edges += ce
                            if target_folder:
                                #entities is a list, so join it before writing
                                self.stringToFile(os.path.join(dir_dest, file),
                                                  ' '.join(entities))
        if graph:  #only touch the graph backend when one was provided
            graph.commit()
            graph.close()
            print("added {0} nodes to {1}".format(count_nodes, graph.graph_nodes))
            print("added {0} edges to {1}".format(count_edges, graph.graph_edges))

    def processString(self,
                      string,
                      ner=False,
                      lemmatize=True,
                      stemming=None,
                      min_word_length=1):
        """
        Apply preprocessing to a string.

        :ner Use named entity recognition (if True) or text preprocessing
            (if False) to extract entities.
        :stemming Whether to apply stemming. May be None, 'light',
            'medium' or 'heavy'.
        :min_word_length Minimum number of characters for a word to be kept.
        :return List of cleaned-up, deduplicated entities
        """
        entities = []
        result = []
        if ner:
            entities = self.getEntities(string)  #perform named entity recognition
        else:
            string = self.removeStopwords(string)  #remove the stopwords in STOP_WORDS
            entities = string.split(" ")  #this creates a bag of words
        if entities:
            for ent in entities:
                ent = self.cleanText(ent)  #remove punctuation and numbers
                if lemmatize:
                    ent = self.lemmatizeText(ent)  #perform lemmatization
                if stemming:
                    ent = self.stemText(ent, intensity=stemming)  #perform stemming
                if min_word_length > 0:
                    ent = self.removeShortWords(ent, min_word_length)  #remove short words
                ent = ent.strip()  #remove leading and trailing spaces
                if ent:
                    result.append(ent)
            result = list(set(result))  #remove duplicates
        return result

    def fileToString(self, file_path):
        """Read the content of a file into a string."""
        text = ""
        if file_path:
            with open(file_path, "r") as f:  #the with block closes the file
                text = f.read()  #omit size to read it all
        return text

    def stringToFile(self, file_path, text):
        """Write the content of a string to a '.txt' file."""
        if file_path and text:
            with open(file_path, "w") as f:
                f.write(text)

    def getEntities(self, text):
        """Apply Named Entity Recognition using Stanford's ADEPTA web service."""
        return self.ner.getTerms(text).get('MEDTERM')

    def lemmatizeText(self, text):
        """Apply lemmatization to a string."""
        l = stem.WordNetLemmatizer()
        bow = text.split(" ")  #this creates a bag of words
        result = []
        for word in bow:
            result.append(l.lemmatize(word))
        return ' '.join(result)

    def stemText(self, text, intensity):
        """Apply stemming to a string according to :intensity."""
        #select nltk stemmer; compare with == rather than identity ('is')
        if intensity == 'light':
            s = stem.PorterStemmer()
        elif intensity == 'medium':
            s = stem.snowball.EnglishStemmer()
        elif intensity == 'heavy':
            s = stem.LancasterStemmer()
        else:
            raise Exception(
                "'{0}' is not a correct intensity parameter. Must be light, medium or heavy."
                .format(intensity))
        bow = text.split(" ")  #this creates a bag of words
        result = []
        for word in bow:
            result.append(s.stem(word))
        return ' '.join(result)

    def cleanText(self, text):
        """
        1. remove the punctuation specified in EXCLUDE
        2. remove numbers
        3. transform to lowercase
        4. remove multiple spaces
        """
        new_text = ""
        for c in text:
            if c in self.exclude or c.isdigit():
                new_text = new_text + " "
            else:
                new_text = new_text + c
        new_text = new_text.lower()
        return ' '.join(new_text.split())  #this removes multiple spaces

    def removeShortWords(self, text, min_word_length):
        """Remove words that are shorter than :min_word_length."""
        bow = text.split(" ")  #this creates a bag of words
        result = []
        for word in bow:
            if len(word) >= min_word_length:
                result.append(word)
        return ' '.join(result)

    def removeStopwords(self, text):
        """Remove the stopwords specified in STOP_WORDS."""
        bow = text.split(" ")  #this creates a bag of words
        result = []
        for word in bow:
            if word not in self.stop_words:
                result.append(word)
        return ' '.join(result)
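
#A minimal usage sketch for the preprocessing pipeline alone (no NER, no
#graph backend). It assumes the nltk 'wordnet' data is installed and that
#STOP_WORDS covers common English stopwords; the sample sentence and the
#expected terms are illustrative, not taken from the project's test suite.
#
#   p = Processor()
#   terms = p.processString("Patients were diagnosed with 2 chronic diseases",
#                           ner=False, lemmatize=True, stemming='medium',
#                           min_word_length=3)
#   #digits are dropped by cleanText and duplicates by set(), so the order
#   #of the result is not guaranteed, e.g. ['patient', 'diagnos', 'chronic',
#   #'diseas']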
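
#A minimal driver sketch, assuming a corpus laid out as corpus/<year>/<doc>.txt;
#"corpus_processed" is a placeholder output folder, and graph=None skips the
#graph backend entirely so only the cleaned files are written.
if __name__ == "__main__":
    processor = Processor()
    processor.folderToGraph(root="corpus",
                            target_folder="corpus_processed",
                            graph=None,
                            ner=False,
                            lemmatize=True,
                            stemming='medium',
                            min_word_length=3)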