Esempio n. 1
0
def worker_search_in_file(args):
    """Search one file for a regular expression and build concordance records.

    Args:
        args: A ``(f, o)`` pair packed into a single argument (presumably so
            the function can be mapped over a worker pool -- confirm with the
            caller). ``f`` is the path of the file to search. ``o`` is an
            options mapping; the keys read here are ``IgnorePOS``,
            ``POSDelim`` (only when ``IgnorePOS`` is truthy), ``IgnoreXML``,
            ``RegExPattern``, ``ContextUnit``, ``ContextLeft``,
            ``ContextRight``, ``WordRegex`` (only when ``ContextUnit`` is not
            ``"Characters"``), ``Corpus`` and, optionally, ``CorpusIndex``.

    Returns:
        A list with one dict per match, in match order. Each dict holds
        ``Key`` (the matched text), ``Right`` and ``Left`` (context after and
        before the match), ``Corpus``, ``CorpusIndex`` (only if present in
        ``o``), ``Filename`` (base name of ``f``) and ``N`` (0-based index of
        the match within the file).
    """
    f, o = args
    # The context manager releases the handle even if reading fails.
    with open(f, "r") as handler:
        f_text = handler.read()

    if bool(o['IgnorePOS']):
        # A tag is the delimiter followed by an upper-case/punctuation label;
        # the tag plus the whitespace after it collapses to a single space.
        # NOTE(review): the delimiter is interpolated into the pattern
        # unescaped, so a regex metacharacter would change the pattern.
        pos_tag = r"[A-Z\.,-]{2,}[A-Z$0-9+*]*?"
        tag_str = "{0}{1}".format(o['POSDelim'], pos_tag)
        f_text = re.sub(r"{0}\s".format(tag_str), r" ", f_text)
    if bool(o['IgnoreXML']):
        f_text = re.sub(r"<[^>]+>", r"", f_text)
        f_text = re.sub(r"&lt;[^&]+&gt;", r" ", f_text)

    conc = []
    for n, m in enumerate(re.finditer(o['RegExPattern'], f_text)):
        s = m.start()
        e = m.end()
        left_n = int(o['ContextLeft'])
        right_n = int(o['ContextRight'])
        c = {'Key': m.group(0)}
        if o['ContextUnit'] == "Characters":
            # Trim the ends and squeeze every whitespace run to one space.
            c['Right'] = re.sub(r"\s+", " ", f_text[e:e + right_n].strip())
            # Clamp the start at 0: a negative start would index from the end
            # of the text and yield an empty or unrelated left context.
            left_raw = f_text[max(0, s - left_n):s]
            c['Left'] = re.sub(r"\s+", " ", left_raw.strip())
        else:
            # Over-fetch 30 characters per requested word so the slice very
            # likely holds enough tokens, then keep only the requested count.
            # Assumes misc.tokenize_str returns a sliceable sequence of
            # strings -- it is sliced and joined below.
            right_tokens = misc.tokenize_str(f_text[e:e + right_n * 30],
                                             o['WordRegex'])
            left_tokens = misc.tokenize_str(f_text[max(0, s - left_n * 30):s],
                                            o['WordRegex'])
            c['Right'] = " ".join(right_tokens[0:right_n])
            # Use the LEFT count here, and guard 0 because [-0:] would keep
            # every token instead of none.
            c['Left'] = " ".join(left_tokens[-left_n:] if left_n > 0 else [])

        c['Corpus'] = o['Corpus']
        if "CorpusIndex" in o:
            c['CorpusIndex'] = o['CorpusIndex']
        c['Filename'] = os.path.basename(f)
        c['N'] = n  # number of match within file, used for sorting after filename
        conc.append(c)
    return conc
Esempio n. 2
0
def worker_search_in_file(args):
    """Find every regex match in one file and return concordance records.

    Args:
        args: A ``(f, o)`` pair packed into a single argument (presumably so
            the function can be mapped over a worker pool -- confirm with the
            caller). ``f`` is the path of the file to search. ``o`` is an
            options mapping; the keys read here are ``IgnorePOS``,
            ``POSDelim`` (only when ``IgnorePOS`` is truthy), ``IgnoreXML``,
            ``RegExPattern``, ``ContextUnit``, ``ContextLeft``,
            ``ContextRight``, ``WordRegex`` (only when ``ContextUnit`` is not
            ``"Characters"``), ``Corpus`` and, optionally, ``CorpusIndex``.

    Returns:
        A list with one dict per match, in match order, with the keys
        ``Key`` (matched text), ``Right``/``Left`` (context after/before the
        match), ``Corpus``, ``CorpusIndex`` (only if present in ``o``),
        ``Filename`` (base name of ``f``) and ``N`` (0-based index of the
        match within the file).
    """
    f, o = args
    # Read inside a context manager so the handle is closed on any error.
    with open(f, "r") as handler:
        f_text = handler.read()

    if bool(o['IgnorePOS']):
        # Replace "<delimiter><TAG><whitespace>" with a single space.
        # NOTE(review): the delimiter goes into the pattern unescaped, so a
        # regex metacharacter delimiter would alter the pattern.
        pos_tag = r"[A-Z\.,-]{2,}[A-Z$0-9+*]*?"
        tag_str = "{0}{1}".format(o['POSDelim'], pos_tag)
        f_text = re.sub(r"{0}\s".format(tag_str), r" ", f_text)
    if bool(o['IgnoreXML']):
        f_text = re.sub(r"<[^>]+>", r"", f_text)
        f_text = re.sub(r"&lt;[^&]+&gt;", r" ", f_text)

    conc = []
    for n, m in enumerate(re.finditer(o['RegExPattern'], f_text)):
        s = m.start()
        e = m.end()
        left_n = int(o['ContextLeft'])
        right_n = int(o['ContextRight'])
        c = {'Key': m.group(0)}
        if o['ContextUnit'] == "Characters":
            # Trim the ends and squeeze every whitespace run to one space.
            c['Right'] = re.sub(r"\s+", " ", f_text[e:e + right_n].strip())
            # A negative slice start would count from the end of the text,
            # so matches near the file start need the start clamped to 0.
            left_raw = f_text[max(0, s - left_n):s]
            c['Left'] = re.sub(r"\s+", " ", left_raw.strip())
        else:
            # Over-fetch 30 characters per requested word so the slice very
            # likely holds enough tokens, then keep only the requested count.
            # Assumes misc.tokenize_str returns a sliceable sequence of
            # strings -- it is sliced and joined below.
            right_tokens = misc.tokenize_str(f_text[e:e + right_n * 30],
                                             o['WordRegex'])
            left_tokens = misc.tokenize_str(f_text[max(0, s - left_n * 30):s],
                                            o['WordRegex'])
            c['Right'] = " ".join(right_tokens[0:right_n])
            # Trim by the LEFT count; 0 is special-cased because [-0:] would
            # return the whole token list.
            c['Left'] = " ".join(left_tokens[-left_n:] if left_n > 0 else [])

        c['Corpus'] = o['Corpus']
        if "CorpusIndex" in o:
            c['CorpusIndex'] = o['CorpusIndex']
        c['Filename'] = os.path.basename(f)
        c['N'] = n  # number of match within file, used for sorting after filename
        conc.append(c)
    return conc
Esempio n. 3
0
def search_in_file(f, o):
    """Search one file for a regular expression and build concordance records.

    Args:
        f: Path of the file to search.
        o: Options mapping. The keys read here are ``IgnorePOSTags``,
            ``POSTagDelimiter`` (only when ``IgnorePOSTags`` is truthy),
            ``IgnoreXMLTags``, ``RegExPattern``, ``ContextUnit``,
            ``ContextLeft``, ``ContextRight`` and ``WordCharsRegex`` (only
            when ``ContextUnit`` is not ``"Characters"``).

    Returns:
        A list with one dict per match, in match order, with the keys
        ``Key`` (matched text), ``Right``/``Left`` (context after/before the
        match) and ``Filename`` (base name of ``f``).
    """
    # The context manager releases the handle even if reading fails.
    with open(f, "r") as handler:
        f_text = handler.read()

    if bool(o['IgnorePOSTags']):
        # Replace "<delimiter><TAG><whitespace>" with one space. Replacing it
        # with nothing would also delete the separator and glue the tagged
        # word onto the next one.
        # NOTE(review): the delimiter goes into the pattern unescaped, so a
        # regex metacharacter delimiter would alter the pattern.
        pos_tag = r"[A-Z]{2,}[A-Z$0-9+*]*?"
        tag_str = "{0}{1}".format(o['POSTagDelimiter'], pos_tag)
        f_text = re.sub(r"{0}\s".format(tag_str), r" ", f_text)
    if bool(o['IgnoreXMLTags']):
        f_text = re.sub(r"<[^>]+>", r"", f_text)
        f_text = re.sub(r"&lt;[^&]+&gt;", r"", f_text)

    conc = []
    for m in re.finditer(o['RegExPattern'], f_text):
        s = m.start()
        e = m.end()
        left_n = int(o['ContextLeft'])
        right_n = int(o['ContextRight'])
        c = {'Key': m.group(0)}
        if o['ContextUnit'] == "Characters":
            # Trim the ends and squeeze every whitespace run to one space.
            c['Right'] = re.sub(r"\s+", " ", f_text[e:e + right_n].strip())
            # Clamp the start at 0: a negative start would index from the end
            # of the text and yield an empty or unrelated left context.
            left_raw = f_text[max(0, s - left_n):s]
            c['Left'] = re.sub(r"\s+", " ", left_raw.strip())
        else:
            # Over-fetch 20 characters per requested word so the slice is
            # likely to hold enough tokens, then keep the requested count.
            # Assumes misc.tokenize_str returns a sliceable sequence of
            # strings -- it is sliced and joined below.
            right_tokens = misc.tokenize_str(f_text[e:e + right_n * 20],
                                             o['WordCharsRegex'])
            left_tokens = misc.tokenize_str(f_text[max(0, s - left_n * 20):s],
                                            o['WordCharsRegex'])
            c['Right'] = " ".join(right_tokens[0:right_n])
            # Use the LEFT count here, and guard 0 because [-0:] would keep
            # every token instead of none.
            c['Left'] = " ".join(left_tokens[-left_n:] if left_n > 0 else [])

        c['Filename'] = os.path.basename(f)
        conc.append(c)
    return conc
Esempio n. 4
0
def search_in_file(f, o):
    """Find every regex match in one file and return concordance records.

    Args:
        f: Path of the file to search.
        o: Options mapping. The keys read here are ``IgnorePOSTags``,
            ``POSTagDelimiter`` (only when ``IgnorePOSTags`` is truthy),
            ``IgnoreXMLTags``, ``RegExPattern``, ``ContextUnit``,
            ``ContextLeft``, ``ContextRight`` and ``WordCharsRegex`` (only
            when ``ContextUnit`` is not ``"Characters"``).

    Returns:
        A list with one dict per match, in match order, with the keys
        ``Key`` (matched text), ``Right``/``Left`` (context after/before the
        match) and ``Filename`` (base name of ``f``).
    """
    # Read inside a context manager so the handle is closed on any error.
    with open(f, "r") as handler:
        f_text = handler.read()

    if bool(o["IgnorePOSTags"]):
        # Replace "<delimiter><TAG><whitespace>" with one space. Replacing it
        # with nothing would also delete the separator and glue the tagged
        # word onto the next one.
        # NOTE(review): the delimiter goes into the pattern unescaped, so a
        # regex metacharacter delimiter would alter the pattern.
        pos_tag = r"[A-Z]{2,}[A-Z$0-9+*]*?"
        tag_str = "{0}{1}".format(o["POSTagDelimiter"], pos_tag)
        f_text = re.sub(r"{0}\s".format(tag_str), r" ", f_text)
    if bool(o["IgnoreXMLTags"]):
        f_text = re.sub(r"<[^>]+>", r"", f_text)
        f_text = re.sub(r"&lt;[^&]+&gt;", r"", f_text)

    conc = []
    for m in re.finditer(o["RegExPattern"], f_text):
        s = m.start()
        e = m.end()
        left_n = int(o["ContextLeft"])
        right_n = int(o["ContextRight"])
        c = {"Key": m.group(0)}
        if o["ContextUnit"] == "Characters":
            # Trim the ends and squeeze every whitespace run to one space.
            c["Right"] = re.sub(r"\s+", " ", f_text[e : e + right_n].strip())
            # A negative slice start would count from the end of the text,
            # so matches near the file start need the start clamped to 0.
            left_raw = f_text[max(0, s - left_n) : s]
            c["Left"] = re.sub(r"\s+", " ", left_raw.strip())
        else:
            # Over-fetch 20 characters per requested word so the slice is
            # likely to hold enough tokens, then keep the requested count.
            # Assumes misc.tokenize_str returns a sliceable sequence of
            # strings -- it is sliced and joined below.
            right_tokens = misc.tokenize_str(
                f_text[e : e + right_n * 20], o["WordCharsRegex"]
            )
            left_tokens = misc.tokenize_str(
                f_text[max(0, s - left_n * 20) : s], o["WordCharsRegex"]
            )
            c["Right"] = " ".join(right_tokens[0:right_n])
            # Trim by the LEFT count; 0 is special-cased because [-0:] would
            # return the whole token list.
            c["Left"] = " ".join(left_tokens[-left_n:] if left_n > 0 else [])

        c["Filename"] = os.path.basename(f)
        conc.append(c)
    return conc