def beginProcess(info, identifier, gdbm_files, filter_file, path, category):
    """Route *info* to the processor selected by *identifier*.

    Identifier codes:
        "w" -- treat *info* as a URL; fetch/process its HTML.
        "f" -- treat *info* as a UTF text file.
        "h" -- treat *info* as an HTML file on disk.

    Any other code is logged as invalid and the function returns early.
    The remaining arguments are passed straight through to the
    url/textfile processor constructors.
    """
    log = logging.getLogger('classify')
    log.debug("classify.beginProcess()")

    # Guard clause: bail out before constructing any processor.
    if identifier not in ("w", "f", "h"):
        log.debug("identifier value is not valid")
        return

    if identifier == "w":
        url.url(gdbm_files, filter_file, path, category).processHTML(info)
    elif identifier == "f":
        textfile.textfile(gdbm_files, filter_file, path, category).processUTFFile(info)
    else:  # identifier == "h"
        textfile.textfile(gdbm_files, filter_file, path, category).processHTMLFile(info)

    log.debug("program terminated")
def import_Tonly(self, file):
    """Load a plain-text (text-only, no annotations) file into the editor state.

    Resets the annotation containers, reads *file* as UTF-8, wraps the
    content in a ``document`` object, and re-parses it with ``textfile``.
    Does nothing when *file* is ``None``, an empty string, or an empty
    tuple (the values a cancelled file dialog typically returns —
    presumably; confirm against the caller).
    """
    # Idiom fix: compare to None with `is not None` (was `file != None`);
    # the '' and () equality checks are kept as-is to preserve behavior.
    if file is not None and file != '' and file != ():
        # Start from a clean annotation state for the new document.
        self.all_spans = namelistmap()
        self.all_rels = {}
        self.all_kstructs = {}
        with open(file, mode='r', encoding='utf-8') as f:
            file_content = f.read()
        file_ident = str(file)
        self.doc = document(ident=file_ident, content=file_content)
        self.data = self.doc.ctnt
        self.curr_file = file
        # Re-run the parse pipeline on the freshly loaded text.
        self.data_parse = textfile(in_text=self.data, configuration=self.config)
def import_TandA(self, file):
    """Load a text-plus-annotations file (a serialized ``document(...)`` repr).

    Resets the annotation containers, reads *file* as UTF-8, and — if the
    content looks like a ``document(...)`` constructor call — evaluates it
    to rebuild the document object, then re-parses its text content.
    Prints an error if the file does not match (i.e. it is text-only).
    """
    self.all_spans = namelistmap()
    self.all_rels = {}
    self.all_kstructs = {}
    with open(file, mode='r', encoding='utf-8') as f:
        file_content = f.read()
    # Raw string: the original 'document\(.*\)' contains the invalid
    # string escape "\(" (a SyntaxWarning on modern Python).
    doc_reg = re.compile(r'document\(.*\)', re.DOTALL | re.IGNORECASE)
    if doc_reg.match(file_content):
        # SECURITY: eval() executes arbitrary code from the file. Only
        # open files from trusted sources; consider a real serialization
        # format (json/ast.literal_eval-compatible) instead.
        self.doc = eval(file_content)
        self.data = self.doc.ctnt
        self.curr_file = file
        self.data_parse = textfile(in_text=self.data, configuration=self.config)
    else:
        print('ERROR tentative to import a text only file!')
def __init__(self, data=None, config_path=None):
    """Initialize the annotation session.

    Parameters:
        data -- a ``document``, a raw string, or None (empty session).
        config_path -- optional path to a file whose content evaluates to
            a configuration dict, e.g.
            {'TREEAGGER_DIR': '/people/koroleva/Desktop/src/TreeTagger'}.
    """
    # Attribute/value pairs configuring annotation tools (e.g. treetagger).
    self.config = {}
    if config_path is not None:
        # BUG FIX: the original asserted `type(config_path is str)`, i.e.
        # type() of a boolean — always truthy, so the type check was a no-op.
        assert isinstance(config_path, str) and os.path.isfile(config_path)
        # SECURITY: eval() executes arbitrary code from the config file;
        # only load configs from trusted locations.
        with open(config_path, 'r') as f:
            self.config = eval(f.read())

    # Type names handed to the namespace to build unique object identifiers.
    Annotate.SPAN_TYP = 'SPAN'
    Annotate.MWU_TYP = 'MWU'
    Annotate.SRC_TYP = 'SRC'
    Annotate.TRGT_TYP = 'TRGT'
    Annotate.REL_TYP = 'REL'
    Annotate.CONSTRU_TYP = 'CONSTRU'
    Annotate.ALL_TYPS = [
        Annotate.SPAN_TYP, Annotate.MWU_TYP, Annotate.SRC_TYP,
        Annotate.TRGT_TYP, Annotate.REL_TYP, Annotate.CONSTRU_TYP,
    ]
    self.name_mgr = namespace(Annotate.ALL_TYPS)

    self.curr_file = None
    self.data = ''
    self.wn = 0
    if type(data) is document:
        self.doc = data
        self.data = self.doc.ctnt
    elif type(data) is str:
        self.doc = document(ident='', content=data, metadata='')
        self.data = data
    else:
        assert data is None
        self.doc = document(ident='', content='', metadata='')
        # NOTE(review): this leaves self.data = None although the empty
        # document's content is '' — kept as-is to preserve behavior, but
        # self.data = '' may have been intended; confirm with callers.
        self.data = data
    self.sentences = []
    self.data_parse = textfile(in_text=self.data, configuration=self.config)
commonscore = compdict(text1, text2) return commonscore + modescore + lenscore + lensenscore def compdict(text1, text2): wordst1 = set(map(lambda x: x[0], text1.sortedwords)) wordst2 = set(map(lambda x: x[0], text2.sortedwords)) temptotal = 0 for word in (wordst1 & wordst2): diffword = text1.words[word] - text1.words[word] temptotal += -1*diffword*diffword + 10 return (len(wordst1 & wordst2) * 50) + temptotal files = [] for i in xrange(30): files.append(textfile(str(i+1) + ".txt")) pairs = [] for j in xrange(30): pairlist = [] try: for i in xrange(30): if i != j and not files[i].link and not files[j].link: pairlist.append(tuple([compscore(files[i], files[j]), i, j])) maxpair = max(pairlist, key=operator.itemgetter(0)) files[maxpair[1]].link = True files[maxpair[2]].link = True pairs.append(maxpair[1:]) except: continue