def worker_search_in_file(args):
    """Search one file for a regular expression and build concordance rows.

    Args:
        args: A ``(f, o)`` pair. ``f`` is the path of the file to search and
            ``o`` is a mapping of search options. Keys read from ``o``:
            ``IgnorePOS``, ``POSDelim`` (only when ``IgnorePOS`` is truthy),
            ``IgnoreXML``, ``RegExPattern``, ``ContextUnit``,
            ``ContextLeft``, ``ContextRight``, ``WordRegex`` (only when
            ``ContextUnit`` is not ``"Characters"``), ``Corpus`` and,
            optionally, ``CorpusIndex``.

    Returns:
        A list with one dict per match, in match order. Each dict holds
        ``Key`` (the matched text), ``Right`` and ``Left`` (context after and
        before the match), ``Corpus``, ``CorpusIndex`` (only if present in
        ``o``), ``Filename`` (base name of ``f``) and ``N`` (index of the
        match within the file).
    """
    f, o = args

    # The context manager closes the file even if reading fails.
    with open(f, "r") as handler:
        f_text = handler.read()

    if o['IgnorePOS']:
        # Strip "<delimiter><TAG>" suffixes, keeping one space between words.
        # NOTE(review): the delimiter is interpolated into the regex
        # unescaped, so it is treated as a pattern - confirm this is intended.
        pos_tag = r"[A-Z\.,-]{2,}[A-Z$0-9+*]*?"
        tag_str = "{0}{1}".format(o['POSDelim'], pos_tag)
        f_text = re.sub(r"{0}\s".format(tag_str), " ", f_text)

    if o['IgnoreXML']:
        f_text = re.sub(r"<[^>]+>", "", f_text)
        # NOTE(review): this second pattern is kept exactly as it was; it may
        # have been meant to target something else (e.g. entities) - confirm.
        f_text = re.sub(r"<[^&]+>", " ", f_text)

    def _squeeze(text):
        # Trim the ends and collapse every whitespace run to a single space.
        return re.sub(r"\s+", " ", text.strip())

    left_n = int(o['ContextLeft'])
    right_n = int(o['ContextRight'])
    by_chars = o['ContextUnit'] == "Characters"
    filename = os.path.basename(f)

    conc = []
    for n, m in enumerate(re.finditer(o['RegExPattern'], f_text)):
        s, e = m.span()
        c = {'Key': m.group(0)}
        if by_chars:
            c['Right'] = _squeeze(f_text[e:e + right_n])
            # Clamp at 0: a negative start index would wrap around to the end
            # of the text and produce an empty or wrong left context.
            c['Left'] = _squeeze(f_text[max(0, s - left_n):s])
        else:
            # assume that each word = 30 characters long, to be reasonably
            # safe
            l_char = left_n * 30
            r_char = right_n * 30
            right_tokens = misc.tokenize_str(f_text[e:e + r_char],
                                             o['WordRegex'])
            left_tokens = misc.tokenize_str(f_text[max(0, s - l_char):s],
                                            o['WordRegex'])
            # Keep the last ``ContextLeft`` words; guard against [-0:], which
            # would return every token instead of none.
            c['Left'] = " ".join(left_tokens[-left_n:]) if left_n > 0 else ""
            c['Right'] = " ".join(right_tokens[0:right_n])
        c['Corpus'] = o['Corpus']
        if "CorpusIndex" in o:
            c['CorpusIndex'] = o['CorpusIndex']
        c['Filename'] = filename
        # number of match within file, used for sorting after filename
        c['N'] = n
        conc.append(c)
    return conc
def worker_search_in_file(args):
    """Search one file for a regular expression and build concordance rows.

    Args:
        args: A ``(f, o)`` pair. ``f`` is the path of the file to search and
            ``o`` is a mapping of search options. Keys read from ``o``:
            ``IgnorePOS``, ``POSDelim`` (only when ``IgnorePOS`` is truthy),
            ``IgnoreXML``, ``RegExPattern``, ``ContextUnit``,
            ``ContextLeft``, ``ContextRight``, ``WordRegex`` (only when
            ``ContextUnit`` is not ``"Characters"``), ``Corpus`` and,
            optionally, ``CorpusIndex``.

    Returns:
        A list with one dict per match, in match order. Each dict holds
        ``Key`` (the matched text), ``Right`` and ``Left`` (context after and
        before the match), ``Corpus``, ``CorpusIndex`` (only if present in
        ``o``), ``Filename`` (base name of ``f``) and ``N`` (index of the
        match within the file).
    """
    f, o = args

    # The context manager closes the file even if reading fails.
    with open(f, "r") as handler:
        f_text = handler.read()

    if o['IgnorePOS']:
        # Strip "<delimiter><TAG>" suffixes, keeping one space between words.
        # NOTE(review): the delimiter is interpolated into the regex
        # unescaped, so it is treated as a pattern - confirm this is intended.
        pos_tag = r"[A-Z\.,-]{2,}[A-Z$0-9+*]*?"
        tag_str = "{0}{1}".format(o['POSDelim'], pos_tag)
        f_text = re.sub(r"{0}\s".format(tag_str), " ", f_text)

    if o['IgnoreXML']:
        f_text = re.sub(r"<[^>]+>", "", f_text)
        # NOTE(review): this second pattern is kept exactly as it was; it may
        # have been meant to target something else (e.g. entities) - confirm.
        f_text = re.sub(r"<[^&]+>", " ", f_text)

    def _squeeze(text):
        # Trim the ends and collapse every whitespace run to a single space.
        return re.sub(r"\s+", " ", text.strip())

    left_n = int(o['ContextLeft'])
    right_n = int(o['ContextRight'])
    by_chars = o['ContextUnit'] == "Characters"
    filename = os.path.basename(f)

    conc = []
    for n, m in enumerate(re.finditer(o['RegExPattern'], f_text)):
        s, e = m.span()
        c = {'Key': m.group(0)}
        if by_chars:
            c['Right'] = _squeeze(f_text[e:e + right_n])
            # Clamp at 0: a negative start index would wrap around to the end
            # of the text and produce an empty or wrong left context.
            c['Left'] = _squeeze(f_text[max(0, s - left_n):s])
        else:
            # assume that each word = 30 characters long, to be reasonably
            # safe
            l_char = left_n * 30
            r_char = right_n * 30
            right_tokens = misc.tokenize_str(f_text[e:e + r_char],
                                             o['WordRegex'])
            left_tokens = misc.tokenize_str(f_text[max(0, s - l_char):s],
                                            o['WordRegex'])
            # Keep the last ``ContextLeft`` words; guard against [-0:], which
            # would return every token instead of none.
            c['Left'] = " ".join(left_tokens[-left_n:]) if left_n > 0 else ""
            c['Right'] = " ".join(right_tokens[0:right_n])
        c['Corpus'] = o['Corpus']
        if "CorpusIndex" in o:
            c['CorpusIndex'] = o['CorpusIndex']
        c['Filename'] = filename
        # number of match within file, used for sorting after filename
        c['N'] = n
        conc.append(c)
    return conc
def search_in_file(f, o):
    """Search one file for a regular expression and build concordance rows.

    Args:
        f: Path of the file to search.
        o: Mapping of search options. Keys read: ``IgnorePOSTags``,
            ``POSTagDelimiter`` (only when ``IgnorePOSTags`` is truthy),
            ``IgnoreXMLTags``, ``RegExPattern``, ``ContextUnit``,
            ``ContextLeft``, ``ContextRight`` and ``WordCharsRegex`` (only
            when ``ContextUnit`` is not ``"Characters"``).

    Returns:
        A list with one dict per match, in match order. Each dict holds
        ``Key`` (the matched text), ``Right`` and ``Left`` (context after and
        before the match) and ``Filename`` (base name of ``f``).
    """
    # The context manager closes the file even if reading fails.
    with open(f, "r") as handler:
        f_text = handler.read()

    if o['IgnorePOSTags']:
        # NOTE(review): the tag AND the whitespace after it are replaced by
        # nothing, which joins the tagged word to the next one - confirm this
        # is intended. The delimiter is also interpolated unescaped.
        pos_tag = r"[A-Z]{2,}[A-Z$0-9+*]*?"
        tag_str = "{0}{1}".format(o['POSTagDelimiter'], pos_tag)
        f_text = re.sub(r"{0}\s".format(tag_str), "", f_text)

    if o['IgnoreXMLTags']:
        f_text = re.sub(r"<[^>]+>", "", f_text)
        # NOTE(review): this second pattern is kept exactly as it was; it may
        # have been meant to target something else (e.g. entities) - confirm.
        f_text = re.sub(r"<[^&]+>", "", f_text)

    def _squeeze(text):
        # Trim the ends and collapse every whitespace run to a single space.
        return re.sub(r"\s+", " ", text.strip())

    left_n = int(o['ContextLeft'])
    right_n = int(o['ContextRight'])
    by_chars = o['ContextUnit'] == "Characters"
    filename = os.path.basename(f)

    conc = []
    for m in re.finditer(o['RegExPattern'], f_text):
        s, e = m.span()
        c = {'Key': m.group(0)}
        if by_chars:
            c['Right'] = _squeeze(f_text[e:e + right_n])
            # Clamp at 0: a negative start index would wrap around to the end
            # of the text and produce an empty or wrong left context.
            c['Left'] = _squeeze(f_text[max(0, s - left_n):s])
        else:
            # assume that each word = 20 characters long, to be reasonably
            # safe
            l_char = left_n * 20
            r_char = right_n * 20
            right_tokens = misc.tokenize_str(f_text[e:e + r_char],
                                             o['WordCharsRegex'])
            left_tokens = misc.tokenize_str(f_text[max(0, s - l_char):s],
                                            o['WordCharsRegex'])
            # Keep the last ``ContextLeft`` words; guard against [-0:], which
            # would return every token instead of none.
            c['Left'] = " ".join(left_tokens[-left_n:]) if left_n > 0 else ""
            c['Right'] = " ".join(right_tokens[0:right_n])
        c['Filename'] = filename
        conc.append(c)
    return conc
def search_in_file(f, o):
    """Search one file for a regular expression and build concordance rows.

    Args:
        f: Path of the file to search.
        o: Mapping of search options. Keys read: ``IgnorePOSTags``,
            ``POSTagDelimiter`` (only when ``IgnorePOSTags`` is truthy),
            ``IgnoreXMLTags``, ``RegExPattern``, ``ContextUnit``,
            ``ContextLeft``, ``ContextRight`` and ``WordCharsRegex`` (only
            when ``ContextUnit`` is not ``"Characters"``).

    Returns:
        A list with one dict per match, in match order. Each dict holds
        ``Key`` (the matched text), ``Right`` and ``Left`` (context after and
        before the match) and ``Filename`` (base name of ``f``).
    """
    # The context manager closes the file even if reading fails.
    with open(f, "r") as handler:
        f_text = handler.read()

    if o["IgnorePOSTags"]:
        # NOTE(review): the tag AND the whitespace after it are replaced by
        # nothing, which joins the tagged word to the next one - confirm this
        # is intended. The delimiter is also interpolated unescaped.
        pos_tag = r"[A-Z]{2,}[A-Z$0-9+*]*?"
        tag_str = "{0}{1}".format(o["POSTagDelimiter"], pos_tag)
        f_text = re.sub(r"{0}\s".format(tag_str), "", f_text)

    if o["IgnoreXMLTags"]:
        f_text = re.sub(r"<[^>]+>", "", f_text)
        # NOTE(review): this second pattern is kept exactly as it was; it may
        # have been meant to target something else (e.g. entities) - confirm.
        f_text = re.sub(r"<[^&]+>", "", f_text)

    def _squeeze(text):
        # Trim the ends and collapse every whitespace run to a single space.
        return re.sub(r"\s+", " ", text.strip())

    left_n = int(o["ContextLeft"])
    right_n = int(o["ContextRight"])
    by_chars = o["ContextUnit"] == "Characters"
    filename = os.path.basename(f)

    conc = []
    for m in re.finditer(o["RegExPattern"], f_text):
        s, e = m.span()
        c = {"Key": m.group(0)}
        if by_chars:
            c["Right"] = _squeeze(f_text[e : e + right_n])
            # Clamp at 0: a negative start index would wrap around to the end
            # of the text and produce an empty or wrong left context.
            c["Left"] = _squeeze(f_text[max(0, s - left_n) : s])
        else:
            # assume that each word = 20 characters long, to be reasonably
            # safe
            l_char = left_n * 20
            r_char = right_n * 20
            right_tokens = misc.tokenize_str(
                f_text[e : e + r_char], o["WordCharsRegex"]
            )
            left_tokens = misc.tokenize_str(
                f_text[max(0, s - l_char) : s], o["WordCharsRegex"]
            )
            # Keep the last ``ContextLeft`` words; guard against [-0:], which
            # would return every token instead of none.
            c["Left"] = " ".join(left_tokens[-left_n:]) if left_n > 0 else ""
            c["Right"] = " ".join(right_tokens[0:right_n])
        c["Filename"] = filename
        conc.append(c)
    return conc