def _speech_attributes(s):
    """Collect the metadata columns for one ``<speech>`` node.

    Reads the node's XML attributes via ``s.get`` (missing attributes become
    empty strings) and derives a few extra columns from them with obctools
    helpers.  The insertion order of the keys is the column order the
    search results carry.
    """
    a = {}
    a['Gender'] = s.get('sex', '')
    # Normalise the trial id: every "o" becomes "t", then "t-" collapses
    # to "t".
    a['Trial'] = s.get('trial', '').replace("o", "t").replace("t-", "t")
    a['Year'] = s.get('year', '')
    if a['Year'] == '':
        # Fall back to characters 1-4 of the trial id
        # (presumably the year part of the id -- verify against the corpus).
        a['Year'] = a['Trial'][1:5]
    for n in (2, 3, 4, 5, 6):
        a['{}-Periods'.format(n)] = obctools.get_periods(a['Year'], n)
    a['Speaker role'] = s.get('role', '')
    a['HISCLASS'] = s.get('HISCLASS', '')
    a['2-Class'] = obctools.convert_hisclass_to_binclass(a['HISCLASS'])
    a['HISCO code'] = s.get('HISCO-code', '')
    a['HISCO label'] = s.get('HISCO-label', '')
    a['Speaker-ID'] = s.get('speaker', '').replace(
        " ", "").replace("-", "").strip()
    for key in ('Printer', 'Publisher', 'Scribe', 'Editor'):
        a[key] = s.get(key.lower(), '')
    return a


def worker_search_in_file(args):
    """Search all ``<speech>`` elements of one XML file for regex matches.

    Args:
        args: a ``(f, o)`` pair.  ``f`` is the file handed to
            ``etree.parse`` (its basename is reported in the results).
            ``o`` is the options mapping; the keys read here are
            ``'regex'`` (an object with ``finditer``), ``'show_pos'``,
            ``'show_pos_key'``, ``'context_left'``, ``'context_right'``
            and ``'search_label'``.

    Returns:
        A list with one dict per match.  Each dict holds the speech
        metadata (see ``_speech_attributes``) followed by ``'Key'``,
        ``'Left'``, ``'Right'``, ``'Filename'`` and ``'Search name'``.

    Tag handling (via ``obctools.strip_pos_tags``, presumably
    part-of-speech tags):
        * ``show_pos == 1``: key and both contexts are kept as matched.
        * otherwise, ``show_pos_key == 1``: only the contexts are stripped.
        * otherwise: key and contexts are all stripped.
    """
    f, o = args
    root = etree.parse(f).getroot()

    # Loop-invariant lookups, done once per file instead of once per match.
    regex = o['regex']
    context_left = o['context_left']
    context_right = o['context_right']
    search_label = o['search_label']
    filename = os.path.basename(f)

    results = []
    for s in root.findall('.//speech'):
        speech_text = obctools.prepare_speech(s)
        a = _speech_attributes(s)
        for m in regex.finditer(speech_text):
            key = m.group(0)
            # Slice with an explicit start index.  The former
            # ``prefix[-context_left:]`` returned the WHOLE prefix when
            # context_left was 0 (``[-0:]`` is ``[0:]``); now a zero-width
            # left context is empty, matching the right-hand side.
            left_start = max(m.start() - context_left, 0)
            left = speech_text[left_start:m.start()]
            right = speech_text[m.end():][:context_right]

            # 'show_pos_key' is only consulted when 'show_pos' is off,
            # exactly as in the original if/elif chain.
            if o['show_pos'] != 1:
                if o['show_pos_key'] != 1:
                    key = obctools.strip_pos_tags(key)
                left = obctools.strip_pos_tags(left)
                right = obctools.strip_pos_tags(right)

            r = dict(a)
            r['Key'] = key
            r['Left'] = left
            r['Right'] = right
            r['Filename'] = filename
            r['Search name'] = search_label
            results.append(r)
    return results
def finish_subcorpus(self):
    """Write the selected subcorpus to disk and report the result in the GUI.

    Does nothing but reset state when ``self.subcorpus_results`` is empty.
    Otherwise each result is a dict with a ``'Nodes'`` list of speech
    elements and a ``'Filename'``.  The output goes to
    ``<tool_path>/subcorpora/<self.filename>``:

    * format ``".txt"``: the text of every node (as produced by
      ``obctools.prepare_speech``), each followed by a blank line;
    * any other format: an XML document whose ``<subcorpus>`` root carries
      the filename and the utterance/word/proceedings counts and contains
      every selected node, tagged with the file it came from.

    ``self.info_label`` is updated with the counts; the "saved as" wording
    is only used when the output directory was available.

    NOTE(review): the original source had lost its indentation; the
    placement of the final status message (inside the ``make_dir`` success
    branch) and of the reset/refresh lines (function level) was
    reconstructed -- confirm against the upstream file.
    """
    if self.subcorpus_results:
        pc = len(self.subcorpus_results)
        wc, uc = self.get_word_and_utterance_count(self.subcorpus_results)
        self.info_label["text"] = "{} utterances ({}) were selected" \
                                  " from {} proceedings.".format(uc, wc, pc)
        subcorpus_path = os.path.join(self.main.tool_path, "subcorpora")
        if obctools.make_dir(subcorpus_path):
            subcorpus_path = os.path.join(subcorpus_path, self.filename)
            if self.subcorpus_format.get() == ".txt":
                with open(subcorpus_path, "w") as handler:
                    for r in self.subcorpus_results:
                        for sn in r['Nodes']:
                            handler.write(obctools.prepare_speech(sn))
                            handler.write("\n\n")
            else:
                # Build the root through the element API rather than by
                # formatting the filename into markup and parsing it: a
                # filename containing '&', '<' or '"' made the old string
                # ill-formed XML and the parse failed.
                root = etree.Element("subcorpus")
                root.set("filename", self.filename)
                root.set("utterances", str(uc))
                root.set("words", str(wc))
                root.set("proceedings", str(pc))
                for r in self.subcorpus_results:
                    for sn in r['Nodes']:
                        # Record which proceedings file the node came from.
                        sn.set("filename", r['Filename'])
                        root.append(sn)
                xml = etree.ElementTree(root)
                with open(subcorpus_path, "wb") as handler:
                    try:
                        xml.write(handler, xml_declaration=True,
                                  encoding="utf-8")
                    except IOError as e:
                        # Best effort: report the failure on stdout and
                        # carry on, as before.
                        print(e)
            self.info_label["text"] = "{:,} utterances ({:,} words) were selected" \
                                      " from {} proceedings" \
                                      " & saved as {}".format(
                                          uc, wc, pc,
                                          os.path.basename(self.filename))
    # Drop the (potentially large) selection and let Tk repaint the label.
    self.subcorpus_results = None
    self.main.root.update_idletasks()
def worker_select_from_file(args):
    """Pick the ``<speech>`` elements of one XML file that pass the filters.

    Args:
        args: a ``(f, o)`` pair.  ``f`` is the file handed to
            ``etree.parse``.  ``o`` maps ``'Gender'``, ``'Speaker role'``
            and ``'HISCLASS'`` to collections of accepted values; an empty
            collection disables that filter.

    Returns:
        ``(results, filename)`` where ``filename`` is the basename of ``f``
        and ``results`` is a one-element list holding
        ``{'Filename': filename, 'Nodes': [matching speech elements]}``.

    A missing or empty attribute counts as the value ``"u"``.  The speaker
    role is lower-cased before it is compared; the other two are not.
    """
    f, o = args
    document_root = etree.parse(f).getroot()
    filename = os.path.basename(f)
    selection = {'Filename': filename, 'Nodes': []}

    # (option key, XML attribute, lower-case the value before comparing?)
    filters = (
        ('Gender', 'sex', False),
        ('Speaker role', 'role', True),
        ('HISCLASS', 'HISCLASS', False),
    )

    for speech in document_root.findall('.//speech'):
        # The prepared text is not used here; the call is kept as in the
        # original.  NOTE(review): it can be dropped if prepare_speech has
        # no side effects on the node -- confirm in obctools.
        obctools.prepare_speech(speech)

        accepted = True
        # Every filter is evaluated (no early exit) so each option key is
        # looked up for each speech, as before.
        for option_key, attribute, lowercase in filters:
            allowed = o[option_key]
            if not len(allowed) > 0:
                continue
            value = speech.get(attribute, 'u')
            if lowercase:
                value = value.lower()
            if value == "":
                value = "u"
            if value not in allowed:
                accepted = False

        if accepted:
            selection['Nodes'].append(speech)

    return [selection], filename