def do_load(self, line):
    '''Loads all texts in a directory.'''
    log('Loading texts and jobs. Might take a minute.')
    del self.texts[:]
    del self.jobs[:]
    self.loadAllJobs()
    self.loadAllTexts()

def run_profile(self, focal, comparison, stopwords, delimiters, maxcost):
    log('Running profile: ' + focal + '_' + comparison + '_'
        + stopwords + '_' + delimiters)
    for t in self.texts:
        t.generateProfile(self.getClass(focal),
                          self.getClass(comparison),
                          self.getClass(stopwords),
                          self.getClass(delimiters),
                          maxcost)

def nodify(self):
    ignores = self.parsehandler.ignore.chars
    classes = self.parsehandler.classes
    file = codecs.open(self.path, encoding='utf-8')
    log("Generating nodes for " + self.id)
    # Position in file
    pos = 0
    node_handler = NodeHandler(self.parsehandler)
    # Disable garbage collector while looping
    gc.disable()
    while True:
        # We read the entire file one character at a time
        current = file.read(1)
        # If not current, we've hit the end of the file
        if not current:
            break
        # Only continue (i.e., increment the count, etc.)
        # if we aren't ignoring this character
        if current in ignores:
            node_handler.clearQueue()
            continue
        pos = pos + 1
        for cc in classes:
            for charset in cc.chars:
                for char in charset:
                    if current == char:
                        node_handler.add(Node(current, cc, pos, charset))
    node_handler.clearQueue()
    self.nodes = node_handler.nodes[:]
    file.close()
    self.charnum = pos
    gc.enable()
    log("\tThere were " + str(self.charnum) + " characters, and "
        + str(len(self.nodes)) + " nodes.")

def loadAllTexts(self):
    for file in os.listdir(self.dirpath):
        if file[0] != '.':
            log('Loading ' + file + '.')
            self.loadText(self.dirpath + '/' + file)
    for text in self.texts:
        text.nodify()

def do_jobs(self, line):
    '''Quick command to run everything in the job batch.'''
    del self.jobs[:]
    self.loadAllJobs()
    for j in self.jobs:
        self.run_profile(j[0], j[1], j[2], j[3], j[4])
    log('Done running job batch.')

def getColocations(self, abscost):
    colocations = []
    log("Compare class was: " + self.compare.id)
    for f in self.focals[:]:
        for e in f.edges:
            if e.cc == self.compare.id and abs(e.cost) <= abscost:
                colocations.append(f)
    return colocations

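# A minimal standalone sketch of the filter above (FakeEdge/FakeNode are
# hypothetical stand-ins, not part of this codebase): a focal node counts as
# a collocation when it has an edge to the comparison class whose absolute
# cost is within the threshold.
from collections import namedtuple

FakeEdge = namedtuple('FakeEdge', ['cc', 'cost'])
FakeNode = namedtuple('FakeNode', ['key', 'edges'])

demo_nodes = [FakeNode('a', [FakeEdge('compare', 2)]),
              FakeNode('b', [FakeEdge('compare', -7)]),
              FakeNode('c', [FakeEdge('stopword', 1)])]
hits = [n for n in demo_nodes
        if any(e.cc == 'compare' and abs(e.cost) <= 3 for e in n.edges)]
assert [n.key for n in hits] == ['a']
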
def printProfile(self):
    log("\n#### Profile ####")
    log("Focals: " + self.focal.id)
    log("Compares: " + self.compare.id)
    for f in self.focals:
        log("\n")
        f.printNode()

def countInSentence(self):
    log("Started counting in sentence for " + self.id)
    count = 0
    for f in self.focals:
        edges = f.edges
        f_pos = f.pos
        closest = self.getClosestTwoDelimiterPositions(f_pos, edges)
        left = closest[0]
        right = closest[1]
        for e in edges:
            pos = e.pos
            if (e.cc == self.compare.id
                    and ((pos > left and pos < f_pos)
                         or (pos < right and pos > f_pos))):
                count = count + 1
    return count

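# A minimal standalone sketch of the sentence window used above (in_sentence
# is a hypothetical helper, not part of this codebase): a comparison position
# counts only if it falls strictly between the nearest delimiter on each side
# of the focal position.
def in_sentence(pos, focal_pos, left_delim, right_delim):
    return (left_delim < pos < focal_pos) or (focal_pos < pos < right_delim)

# Focal at 20, surrounding sentence delimiters at 15 and 30:
assert in_sentence(18, 20, 15, 30) is True   # same sentence
assert in_sentence(12, 20, 15, 30) is False  # before the left boundary
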
def printProfile(self):
    log("\n#### Profile ####")
    log("Focals: " + self.focal.id)
    log("Compares: " + self.compare.id)
    for k, v in self.edge_count.items():
        print('Edge count (' + str(k) + '): ' + str(v))
    for k, v in self.contingency.items():
        print('Contingency table (' + str(k) + '): ' + str(v))

def generateProfile(self, focal, compare, stopword, delim, maxcost=120):
    focals = []
    stopwords = []
    delims = []
    compares = []
    new_list = []
    for n in self.nodes:
        x = Node(n.char, n.cc, n.pos, n.key)
        new_list.append(x)
    log(len(new_list))
    # Sort the nodes into their correct categories
    for n in new_list:
        # for n in self.nodes[:]:
        if n.cc == focal:
            focals.append(n)
        elif n.cc == stopword:
            stopwords.append(n)
        elif n.cc == delim:
            delims.append(n)
        elif n.cc == compare:
            compares.append(n)
    p = NodeProfile(focals, stopwords, delims, compares,
                    focal, compare, stopword, delim, maxcost)
    self.profiles.append(p)

def printEdge(self):
    log("\t\t#### Edge ####")
    log("\t\tClass: " + self.cc)
    log("\t\tId: " + self.id)
    log("\t\tCost: " + str(self.cost))
    log("\t\tAbsolute cost: " + str(abs(self.cost)) + "\n")

def printNode(self):
    log("\t#### Node ####")
    log("\tClass: " + self.cc.id)
    log("\tKey: " + self.key)
    for e in self.edges:
        e.printEdge()

def getClass(self, id):
    for c in self.classes:
        if c.id == id:
            return c
    log('No class found for ' + id)
    return None

def generateEdges(self):
    log("Generating edges for " + self.id)
    max = self.maxcost
    neg_max = (-1 * max)
    edge_count = 0
    # The list position of the first found element so we don't need to
    # keep checking the beginning of the list when abs(cost) > maxcost
    first_stop = 0
    first_delim = 0
    first_compare = 0
    # Optimization tricks
    stopword = self.stopword.id
    delim = self.delim.id
    for f in self.focals:
        gc.disable()
        f_pos = f.pos
        # For each newly-minted focal node, determine the distance to
        # each stopword if the node is within maxcost absolute distance.
        # This is because we don't want to count these words towards the
        # distance of future nodes.
        found_first_stop = False
        stop_index = first_stop
        for s in self.stopwords[first_stop:]:
            s_cost = f_pos - s.pos
            # Stop at upper searching bound
            if s_cost < neg_max:
                break
            # Double-checking constraints, might not be necessary
            if abs(s_cost) <= max and s_cost != 0:
                # Set first matched stop character to
                # lower bound for searching
                if found_first_stop == False:
                    found_first_stop = True
                    first_stop = stop_index
                f.add(Edge(s, s_cost))
                edge_count += 1
            stop_index += 1
        # Do the same for the delimiters.
        found_first_delim = False
        delim_index = first_delim
        for d in self.delims[first_delim:]:
            d_pos = d.pos
            d_cost = f_pos - d_pos
            d_takeaway = 0
            if d_cost < neg_max:
                break
            for e in f.edges:
                e_pos = e.pos
                # If this edge is between the delimiter and the focal
                # character, and it's a stopword, we'll need to account
                # for the position difference since stopwords are (sometimes)
                # to be considered the same as whitespace.
                if (((e_pos > d_pos and e_pos < f_pos)
                        or (e_pos < d_pos and e_pos > f_pos))
                        and (e.cc == stopword)):
                    d_takeaway += 1
            # Decrease the absolute cost
            if d_cost < 0:
                d_cost += d_takeaway
            elif d_cost > 0:
                d_cost -= d_takeaway
            if abs(d_cost) <= max and d_cost != 0:
                if found_first_delim == False:
                    found_first_delim = True
                    first_delim = delim_index
                f.add(Edge(d, d_cost))
                edge_count += 1
            delim_index += 1
        # Now we can calculate the compares by the distance ignoring
        # stopwords and delimiters, giving a better true distance
        found_first_compare = False
        compare_index = first_compare
        for c in self.compares[first_compare:]:
            c_pos = c.pos
            c_cost = f_pos - c_pos
            if c_cost < neg_max:
                break
            takeaway = 0
            for e in f.edges:
                e_pos = e.pos
                # Similarly to above, both stopwords and delimiters
                # are considered to not count towards edge costs for
                # comparison characters.
                if (((e_pos > c_pos and e_pos < f_pos)
                        or (e_pos < c_pos and e_pos > f_pos))
                        and (e.cc == stopword or e.cc == delim)):
                    takeaway += 1
            if c_cost < 0:
                c_cost += takeaway
            elif c_cost > 0:
                c_cost -= takeaway
            if abs(c_cost) <= max and c_cost != 0:
                if found_first_compare == False:
                    found_first_compare = True
                    first_compare = compare_index
                f.add(Edge(c, c_cost))
                edge_count += 1
            compare_index += 1
    gc.enable()
    log("Edge count was " + str(edge_count))

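# A minimal standalone sketch of the cost adjustment above (adjusted_cost is
# a hypothetical helper, not part of this codebase): the raw cost is the
# position difference, and every skipped character (stopword or delimiter)
# lying strictly between the two positions shrinks the absolute cost by one.
def adjusted_cost(focal_pos, other_pos, skip_positions):
    cost = focal_pos - other_pos
    lo, hi = min(focal_pos, other_pos), max(focal_pos, other_pos)
    takeaway = sum(1 for p in skip_positions if lo < p < hi)
    if cost < 0:
        cost += takeaway
    elif cost > 0:
        cost -= takeaway
    return cost

# Focal at 10, comparison at 6, stopwords at 7 and 8:
# raw cost 4, two stopwords in between, adjusted cost 2.
assert adjusted_cost(10, 6, [7, 8]) == 2
assert adjusted_cost(6, 10, [7, 8]) == -2
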
def printNode(self):
    log("\t#### Node ####")
    log("\tClass: " + self.cc.id)
    log("\tKey: " + self.key)

def do_jobs(self, line):
    '''Quick command to run everything in the job batch.'''
    for j in self.jobs:
        self.run_profile(j[0], j[1], j[2], j[3], j[4])
    log('Done running job batch.')

def countAllInSentence(self):
    log("Started counting in sentence for " + self.id)
    count = 0
    for f in self.focals:
        count += self.countInSentence(f)
    return count