def prev_next_obj(loader_obj, text, depth=5):
    """Link each object to its previous and next sibling of the same type.

    Threads 'prev'/'next' attributes (space-joined philo ids) through the
    records in text['sortedtoms'], then re-filters and re-sorts the output
    back into text['sortedtoms']. (Python 2 code.)

    Fix: guard the final flush loop against object types that never occur
    in this text (KeyError), matching inner_prev_next_obj.
    """
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    record_dict = {}
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['sortedtoms']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if type in record_dict:
            # A new object of this type completes the pending one: give it
            # its 'next' pointer and flush it.
            record_dict[type].attrib['next'] = ' '.join(id)
            if type in object_types:
                print >> output_file, record_dict[type]
            else:
                # Types beyond the requested depth are emitted without
                # prev/next links.
                del record_dict[type].attrib['next']
                del record_dict[type].attrib['prev']
                print >> output_file, record_dict[type]
            record.attrib['prev'] = ' '.join(record_dict[type].id)
            record_dict[type] = record
        else:
            # First object of its type: no predecessor.
            record.attrib['prev'] = ''
            record_dict[type] = record
    object_types.reverse()
    for obj in object_types:
        try:
            # Flush the last pending record of each type with an empty 'next'.
            record_dict[obj].attrib['next'] = ''
            print >> output_file, record_dict[obj]
        except KeyError:
            # This object type never occurred in the text.
            pass
    output_file.close()
    os.remove(text['sortedtoms'])
    tomscommand = "cat %s | egrep \"^doc|^div|^para\" | sort %s > %s" % (
        temp_file, loader_obj.sort_by_id, text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
def normalize_these_columns(loader_obj,text):
    """Propagate the metadata in `columns` from div1/div2 down to div3.

    `columns` is a free variable supplied by the enclosing scope. div1
    resets every tracked value (missing ones go blank), div2 overrides the
    values it defines, and div3 inherits any value it lacks. Rewrites
    text["sortedtoms"] in place via a temp file. (Python 2 code.)
    """
    current_values = {}
    tmp_file = open(text["sortedtoms"] + ".tmp","w")
    for column in columns:
        current_values[column] = ""
    for line in open(text["sortedtoms"]):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        # NOTE(review): eval() on attrib assumes trusted loader output; the
        # newer variant of this function parses with loads() instead.
        record.attrib = eval(attrib)
        if type == "div1":
            # div1 resets every tracked column.
            for column in columns:
                if column in record.attrib:
                    current_values[column] = record.attrib[column]
                else:
                    current_values[column] = ""
        elif type == "div2":
            # div2 only overrides columns it actually defines.
            for column in columns:
                if column in record.attrib:
                    current_values[column] = record.attrib[column]
        elif type == "div3":
            # div3 inherits any column it does not define itself.
            for column in columns:
                if column not in record.attrib:
                    record.attrib[column] = current_values[column]
        print >> tmp_file, record
    tmp_file.close()
    os.remove(text["sortedtoms"])
    os.rename(text["sortedtoms"] + ".tmp",text["sortedtoms"])
def prev_next_obj(loader_obj, text, depth=5):
    """Link each object to its previous and next sibling of the same type.

    Duplicate of the variant above: threads 'prev'/'next' attributes through
    text['sortedtoms'] and re-sorts the filtered output in place.
    (Python 2 code.)

    Fix: the final flush loop raised KeyError for any object type that never
    occurred in this text; guard it as inner_prev_next_obj does.
    """
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    record_dict = {}
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['sortedtoms']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if type in record_dict:
            # New object of this type: finish the pending one and flush it.
            record_dict[type].attrib['next'] = ' '.join(id)
            if type in object_types:
                print >> output_file, record_dict[type]
            else:
                # Untracked depths are written without prev/next links.
                del record_dict[type].attrib['next']
                del record_dict[type].attrib['prev']
                print >> output_file, record_dict[type]
            record.attrib['prev'] = ' '.join(record_dict[type].id)
            record_dict[type] = record
        else:
            record.attrib['prev'] = ''
            record_dict[type] = record
    object_types.reverse()
    for obj in object_types:
        try:
            record_dict[obj].attrib['next'] = ''
            print >> output_file, record_dict[obj]
        except KeyError:
            # This object type never occurred in the text.
            pass
    output_file.close()
    os.remove(text['sortedtoms'])
    tomscommand = "cat %s | egrep \"^doc|^div|^para\" | sort %s > %s" % (temp_file,loader_obj.sort_by_id,text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
def inner_prev_next_obj(loader_obj, text):
    """Thread prev/next links through the objects in text['sortedtoms'].

    `types` is a free variable from the enclosing scope listing the object
    types that keep prev/next attributes. Output is filtered to
    loader_obj.types and re-sorted back into text['sortedtoms'].
    (Python 2 code.)
    """
    record_dict = {}
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['sortedtoms']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if type in record_dict:
            # A new object of this type completes the pending one: point it
            # at the new id and flush it.
            record_dict[type].attrib['next'] = ' '.join(id)
            if type in types:
                print >> output_file, record_dict[type]
            else:
                # Untracked types are written without prev/next attributes.
                del record_dict[type].attrib['next']
                del record_dict[type].attrib['prev']
                print >> output_file, record_dict[type]
            record.attrib['prev'] = ' '.join(record_dict[type].id)
            record_dict[type] = record
        else:
            # First object of its type has no predecessor.
            record.attrib['prev'] = ''
            record_dict[type] = record
    types.reverse()  # NOTE(review): mutates the closed-over list in place
    for obj in types:
        try:
            # Flush the final pending record of each type with empty 'next'.
            record_dict[obj].attrib['next'] = ''
            print >> output_file, record_dict[obj]
        except KeyError:
            # This type never occurred in the text.
            pass
    output_file.close()
    os.remove(text['sortedtoms'])
    type_pattern = "|".join("^%s" % t for t in loader_obj.types)
    tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (temp_file,type_pattern,loader_obj.sort_by_id,text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
def get_word_counts(_, text):
    """Count words per object level and collect attribute names.

    Streams text["raw"], stamps a word_count attribute onto each closing
    doc/div1/div2/div3/para record, rewrites the file in place, and returns
    the set of attribute keys seen across all records.
    """
    levels = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
    running = [0, 0, 0, 0, 0]  # one counter per tracked level
    seen_attribs = set()
    tmp_path = text["raw"] + ".tmp"
    with open(tmp_path, "w") as out_fh:
        with open(text["raw"], encoding="utf8") as in_fh:
            for line in in_fh:
                philo_type, word, philo_id, attrib = line.split("\t")
                philo_id = philo_id.split()
                record = Record(philo_type, word, philo_id)
                record.attrib = loads(attrib)
                if philo_type == "word":
                    # NOTE(review): lowercasing is local only — the record
                    # keeps the original case, as in the original code.
                    word = word.lower()
                for level in range(len(running)):
                    if philo_type == "word":
                        running[level] += 1
                    elif philo_type == levels[level]:
                        # Closing record for this level: store and reset.
                        record.attrib["word_count"] = running[level]
                        running[level] = 0
                print(record, file=out_fh)
                seen_attribs.update(record.attrib.keys())
    os.remove(text["raw"])
    os.rename(tmp_path, text["raw"])
    return seen_attribs
def normalize_these_columns(loader_obj, text):
    """Propagate `columns` metadata from div1/div2 down to div3 records.

    `columns` comes from the enclosing scope. div1 resets every tracked
    value, div2 overrides the ones it defines, and div3 inherits any value
    it does not define itself. text["sortedtoms"] is rewritten in place.
    """
    tracked = {column: "" for column in columns}
    out_fh = open(text["sortedtoms"] + ".tmp", "w")
    for line in open(text["sortedtoms"]):
        philo_type, word, philo_id, attrib = line.split('\t')
        record = Record(philo_type, word, philo_id.split())
        record.attrib = loads(attrib)
        if philo_type == "div1":
            # div1 resets every tracked column (missing ones go blank).
            for column in columns:
                tracked[column] = record.attrib.get(column, "")
        elif philo_type == "div2":
            # div2 only overrides columns it actually defines.
            for column in columns:
                if column in record.attrib:
                    tracked[column] = record.attrib[column]
        elif philo_type == "div3":
            # div3 inherits any column it lacks.
            for column in columns:
                record.attrib.setdefault(column, tracked[column])
        print(record, file=out_fh)
    out_fh.close()
    os.remove(text["sortedtoms"])
    os.rename(text["sortedtoms"] + ".tmp", text["sortedtoms"])
def load_record(line):
    """Parse one tab-delimited toms line into a Record with blank prev/next."""
    philo_type, word, philo_id, attrib = line.split('\t')
    record = Record(philo_type, word, philo_id.split())
    record.attrib = loads(attrib)
    record.attrib["prev"] = ""
    record.attrib["next"] = ""
    return record
def load_record(line):
    """Build a Record from a raw toms line, blanking its prev/next links."""
    rec_type, token, raw_id, raw_attrib = line.split('\t')
    record = Record(rec_type, token, raw_id.split())
    attributes = loads(raw_attrib)
    attributes["prev"] = ""
    attributes["next"] = ""
    record.attrib = attributes
    return record
def tag_words(loader_obj, text):
    """Run TreeTagger over word tokens and merge pos/lemma into text["raw"].

    Two passes: first stream every word to a TreeTagger subprocess
    (restarted every `maxlines` lines), then re-read text["raw"] and attach
    the tagger's pos/lemma output to each word record in lockstep.
    Free variables from the enclosing scope: tt_path, param_file, maxlines.
    (Python 2 code: bytes/str round-trips, print statements.)
    """
    # Set up the treetagger process
    tt_args = [
        tt_path, "-token", "-lemma", "-prob", '-no-unknown', "-threshold",
        ".01", param_file
    ]
    ttout_fh = open(text["raw"] + ".ttout", "w")
    tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_fh)
    raw_fh = open(text["raw"], "r")
    line_count = 0
    # read through the object file, pass the words to treetagger
    for line in raw_fh:
        type, word, id, attrib = line.split('\t')
        id = id.split()
        if type == "word":
            # Lowercase UTF-8 so the merge pass below can compare against
            # exactly what was fed to the tagger.
            word = word.decode('utf-8', 'ignore').lower().encode('utf-8')
            # close and re-open the treetagger process to prevent garbage output.
            if line_count > maxlines:
                tt_worker.stdin.close()
                tt_worker.wait()
                new_ttout_fh = open(text["raw"] + ".ttout", "a")
                tt_worker = Popen(tt_args, stdin=PIPE, stdout=new_ttout_fh)
                line_count = 0
            print >> tt_worker.stdin, word
            line_count += 1
    # finish tagging
    tt_worker.stdin.close()
    tt_worker.wait()
    # go back through the object file, and add the treetagger results to each word
    tmp_fh = open(text["raw"] + ".tmp", "w")
    tag_fh = open(text["raw"] + ".ttout", "r")
    for line in open(text["raw"], "r"):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if type == "word":
            tag_l = tag_fh.readline()
            next_word, tag = tag_l.split("\t")[0:2]
            pos, lem, prob = tag.split(" ")
            # Tagger output must stay in lockstep with the word stream;
            # abort the whole merge on any mismatch.
            if next_word != word.decode('utf-8', 'ignore').lower().encode('utf-8'):
                print >> sys.stderr, "TREETAGGER ERROR:", next_word, " != ", word, pos, lem
                return
            else:
                record.attrib["pos"] = pos
                record.attrib["lemma"] = lem
                print >> tmp_fh, record
        else:
            print >> tmp_fh, record
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
    os.remove(text["raw"] + ".ttout")
def fix_pages(loader_obj,text,depth=4):
    """Unfinished, do not use"""
    # NOTE(review): this stub only parses records — it never writes to
    # temp_file, never updates current_page, and never replaces the source
    # file. Kept as-is per the docstring's warning.
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    current_page = 0;
    temp_file = open(text["sortedtoms"] + ".tmp","w")
    for line in open(text["sortedtoms"]):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
def tag_words(loader_obj, text):
    """Run TreeTagger over word tokens and merge pos/lemma into text["raw"].

    Variant of the function above that parses attributes with loads() in
    the merge pass. Free variables: tt_path, param_file, maxlines.
    (Python 2 code.)
    """
    # Set up the treetagger process
    tt_args = [tt_path, "-token", "-lemma", "-prob", '-no-unknown', "-threshold", ".01", param_file]
    ttout_fh = open(text["raw"] + ".ttout", "w")
    tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_fh)
    raw_fh = open(text["raw"], "r")
    line_count = 0
    # read through the object file, pass the words to treetagger
    for line in raw_fh:
        type, word, id, attrib = line.split('\t')
        id = id.split()
        if type == "word":
            # Lowercase UTF-8 so the merge pass can match tagger output.
            word = word.decode('utf-8', 'ignore').lower().encode('utf-8')
            # close and re-open the treetagger process to prevent garbage
            # output.
            if line_count > maxlines:
                tt_worker.stdin.close()
                tt_worker.wait()
                new_ttout_fh = open(text["raw"] + ".ttout", "a")
                tt_worker = Popen(tt_args, stdin=PIPE, stdout=new_ttout_fh)
                line_count = 0
            print >> tt_worker.stdin, word
            line_count += 1
    # finish tagging
    tt_worker.stdin.close()
    tt_worker.wait()
    # go back through the object file, and add the treetagger results to
    # each word
    tmp_fh = open(text["raw"] + ".tmp", "w")
    tag_fh = open(text["raw"] + ".ttout", "r")
    for line in open(text["raw"], "r"):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = loads(attrib)
        if type == "word":
            tag_l = tag_fh.readline()
            next_word, tag = tag_l.split("\t")[0:2]
            pos, lem, prob = tag.split(" ")
            # Abort the merge entirely if tagger output falls out of
            # lockstep with the word stream.
            if next_word != word.decode('utf-8', 'ignore').lower().encode('utf-8'):
                print >> sys.stderr, "TREETAGGER ERROR:", next_word, " != ", word, pos, lem
                return
            else:
                record.attrib["pos"] = pos
                record.attrib["lemma"] = lem
                print >> tmp_fh, record
        else:
            print >> tmp_fh, record
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
    os.remove(text["raw"] + ".ttout")
def normalize_unicode_raw_words(loader_obj, text):
    """Lowercase every word token in text["raw"], rewriting it in place.

    (Python 2 code: words are decoded from UTF-8, lowercased, re-encoded.)
    NOTE(review): tmp_file is never closed before the rename — buffered
    output may be lost; the duplicate variant below should close it.
    """
    tmp_file = open(text["raw"] + ".tmp","w")
    for line in open(text["raw"]):
        rec_type, word, id, attrib = line.split('\t')
        id = id.split()
        if rec_type == "word":
            word = word.decode("utf-8").lower().encode("utf-8")
        record = Record(rec_type, word, id)
        record.attrib = eval(attrib)
        print >> tmp_file, record
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp",text["raw"])
def normalize_unicode_raw_words(loader_obj, text):
    """Lowercase every word token in text["raw"], rewriting it in place.

    (Python 2 code: words are decoded from UTF-8, lowercased, re-encoded.)
    Fix: close the temp file before removing/renaming so buffered output is
    flushed to disk — the original relied on interpreter GC to flush.
    """
    tmp_file = open(text["raw"] + ".tmp", "w")
    for line in open(text["raw"]):
        rec_type, word, id, attrib = line.split('\t')
        id = id.split()
        if rec_type == "word":
            word = word.decode("utf-8").lower().encode("utf-8")
        record = Record(rec_type, word, id)
        record.attrib = eval(attrib)
        print >> tmp_file, record
    tmp_file.close()
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
def normalize_unicode_raw_words(loader_obj, text):
    """Lowercase every word token in text["raw"], rewriting the file in place."""
    out_path = text["raw"] + ".tmp"
    out_fh = open(out_path, "w")
    with open(text["raw"]) as in_fh:
        for line in in_fh:
            rec_type, word, philo_id, attrib = line.split('\t')
            if rec_type == "word":
                word = word.lower()
            record = Record(rec_type, word, philo_id.split())
            record.attrib = loads(attrib)
            print(record, file=out_fh)
    out_fh.close()
    os.remove(text["raw"])
    os.rename(out_path, text["raw"])
def normalize_unicode_raw_words(loader_obj, text):
    """Lowercase every word token in text["raw"], rewriting the file in place.

    Fix: the original called word.decode(...).lower().encode(...), but under
    Python 3 (this variant already uses print(..., file=...)) lines read from
    a text-mode file are str, which has no decode(); plain str.lower() is the
    correct normalization, matching the sibling variant of this function.
    """
    tmp_file = open(text["raw"] + ".tmp", "w")
    with open(text["raw"]) as fh:
        for line in fh:
            rec_type, word, id, attrib = line.split('\t')
            id = id.split()
            if rec_type == "word":
                word = word.lower()
            record = Record(rec_type, word, id)
            record.attrib = loads(attrib)
            print(record, file=tmp_file)
    tmp_file.close()
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
def inner_prev_next_obj(loader_obj, text):
    """Store the previous and next object for every object passed to this function

    By default, this is doc, div1, div2, div3.

    `philo_types` is a free variable from the enclosing scope listing the
    tracked object types. Also collects and returns the set of attribute
    keys seen on all records. Output is filtered to loader_obj.types and
    re-sorted (LANG=C for byte order) back into text["sortedtoms"].
    """
    record_dict = {}
    temp_file = text["raw"] + ".tmp"
    output_file = open(temp_file, "w")
    attrib_set = set()
    with open(text["sortedtoms"]) as filehandle:
        for line in filehandle:
            philo_type, word, philo_id, attrib = line.split("\t")
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            if philo_type in record_dict:
                # New object of this type: finish the pending one, flush it.
                record_dict[philo_type].attrib["next"] = " ".join(philo_id)
                if philo_type in philo_types:
                    print(record_dict[philo_type], file=output_file)
                else:
                    # Untracked types are written without prev/next links.
                    del record_dict[philo_type].attrib["next"]
                    del record_dict[philo_type].attrib["prev"]
                    print(record_dict[philo_type], file=output_file)
                record.attrib["prev"] = " ".join(
                    record_dict[philo_type].id)
                record_dict[philo_type] = record
            else:
                # First object of its type has no predecessor.
                record.attrib["prev"] = ""
                record_dict[philo_type] = record
            attrib_set.update(record.attrib.keys())
    philo_types.reverse()  # NOTE(review): mutates the closed-over list in place
    for obj in philo_types:
        try:
            # Flush the last pending record of each type with empty 'next'.
            record_dict[obj].attrib["next"] = ""
            print(record_dict[obj], file=output_file)
        except KeyError:
            # This type never occurred in the text.
            pass
    output_file.close()
    os.remove(text["sortedtoms"])
    philo_type_pattern = "|".join("^%s" % t for t in loader_obj.types)
    tomscommand = 'cat %s | egrep "%s" | LANG=C sort %s > %s' % (
        temp_file,
        philo_type_pattern,
        loader_obj.sort_by_id,
        text["sortedtoms"],
    )
    os.system(tomscommand)
    os.remove(temp_file)
    return attrib_set
def inner_make_object_ancestors(loader_obj, text):
    """Add a <type>_ancestor attribute (zero-padded philo id) per object type.

    Free variables from the enclosing scope: `types` (object type names)
    and `type_depth` (type name -> philo id prefix length). Rewrites
    text['words'] in place. (Python 2 code.)
    """
    temp_file = text['words'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['words']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for type in types:
            # Pad the ancestor id out to the full 7 fields with zeros;
            # note the slice is always taken from the original `id`.
            zeros_to_add = ['0' for i in range(7 - type_depth[type])]
            philo_id = id[:type_depth[type]] + zeros_to_add
            record.attrib[type + '_ancestor'] = ' '.join(philo_id)
        print >> output_file, record
    output_file.close()
    os.remove(text['words'])
    os.rename(temp_file, text['words'])
def inner_make_object_ancestors(loader_obj, text):
    """Add a <philo_type>_ancestor attribute (zero-padded id) per object type.

    Free variables from the enclosing scope: `philo_types` and
    `philo_type_depth`. Rewrites text['words'] in place.

    Bug fix: the original reassigned `philo_id` inside the inner loop, so
    after the first philo_type every later ancestor was sliced from an
    already zero-padded id instead of the record's real id (the Python 2
    variant correctly slices from the original id each time).
    """
    temp_file = text['words'] + '.tmp'
    output_file = open(temp_file, 'w')
    with open(text['words']) as filehandle:
        for line in filehandle:
            philo_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            for ancestor_type in philo_types:
                depth = philo_type_depth[ancestor_type]
                # Pad the ancestor id out to the full 7 fields with zeros,
                # always slicing from the record's original id.
                ancestor_id = philo_id[:depth] + ['0'] * (7 - depth)
                record.attrib[ancestor_type + '_ancestor'] = ' '.join(ancestor_id)
            print(record, file=output_file)
    output_file.close()
    os.remove(text['words'])
    os.rename(temp_file, text['words'])
def smash_these_unicode_columns(loader_obj, text):
    """Add lowercased, diacritic-stripped <column>_norm attributes.

    `columns` is a free variable from the enclosing scope. Rewrites
    text["sortedtoms"] in place.

    Bug fix: the original called .decode("utf-8")/.encode("utf-8") on the
    attribute values, but this variant already runs under Python 3
    (print(..., file=...)), where values parsed by loads() are str —
    str.decode() raises AttributeError. Work directly on str, matching the
    sibling Python 3 variant of this function.
    """
    tmp_file = open(text["sortedtoms"] + ".tmp", "w")
    for line in open(text["sortedtoms"]):
        philo_type, word, philo_id, attrib = line.split('\t')
        philo_id = philo_id.split()
        record = Record(philo_type, word, philo_id)
        record.attrib = loads(attrib)
        for column in columns:
            if column in record.attrib:
                col = record.attrib[column].lower()
                # NFKD separates base characters from combining accents so
                # the accents can be dropped.
                smashed_col = [c for c in unicodedata.normalize("NFKD", col)
                               if not unicodedata.combining(c)]
                record.attrib[column + "_norm"] = ''.join(smashed_col)
        print(record, file=tmp_file)
    tmp_file.close()
    os.remove(text["sortedtoms"])
    os.rename(text["sortedtoms"] + ".tmp", text["sortedtoms"])
def make_word_counts(loader_obj, text, depth=4):
    """Attach a word_count attribute to each closing object record.

    Keeps one running word counter per object level up to `depth`
    (doc..div3 by default); rewrites text["raw"] in place. (Python 2 code.)
    """
    object_types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
    counts = [0 for i in range(depth)]
    temp_file = text["raw"] + ".tmp"
    output_file = open(temp_file, "w")
    for line in open(text["raw"]):
        type, word, id, attrib = line.split("\t")
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for d, count in enumerate(counts):
            if type == "word":
                # Every word increments the counter at every tracked level.
                counts[d] += 1
            elif type == object_types[d]:
                # Closing record for this level: store and reset its counter.
                record.attrib["word_count"] = counts[d]
                counts[d] = 0
        print >> output_file, record
    output_file.close()
    os.remove(text["raw"])
    os.rename(temp_file, text["raw"])
def make_word_counts(loader_obj, text, depth=5):
    """Attach a word_count attribute to each closing object record.

    Variant with depth=5 (doc..para). Rewrites text['raw'] in place.
    (Python 2 code.)
    """
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word']
    counts = [0 for i in range(depth)]
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['raw']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for d,count in enumerate(counts):
            if type == 'word':
                # Every word increments the counter at every tracked level.
                counts[d] += 1
            elif type == object_types[d]:
                # Closing record for this level: store and reset its counter.
                record.attrib['word_count'] = counts[d]
                counts[d] = 0
        print >> output_file, record
    output_file.close()
    os.remove(text['raw'])
    os.rename(temp_file, text['raw'])
def make_word_counts(loader_obj, text, depth=4):
    """Attach a word_count attribute to each closing object record.

    Variant with depth=4 (doc..div3). Rewrites text['raw'] in place.
    (Python 2 code.)
    """
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word']
    counts = [0 for i in range(depth)]
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['raw']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for d, count in enumerate(counts):
            if type == 'word':
                # Every word increments the counter at every tracked level.
                counts[d] += 1
            elif type == object_types[d]:
                # Closing record for this level: store and reset its counter.
                record.attrib['word_count'] = counts[d]
                counts[d] = 0
        print >> output_file, record
    output_file.close()
    os.remove(text['raw'])
    os.rename(temp_file, text['raw'])
def inner_prev_next_obj(loader_obj, text):
    """Store the previous and next object for every object passed to this function

    By default, this is doc, div1, div2, div3.

    `philo_types` is a free variable from the enclosing scope listing the
    tracked object types. Output is filtered to loader_obj.types and
    re-sorted back into text["sortedtoms"].
    """
    record_dict = {}
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    with open(text['sortedtoms']) as filehandle:
        for line in filehandle:
            philo_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            if philo_type in record_dict:
                # A new object of this type completes the pending one: set
                # its 'next' pointer and flush it.
                record_dict[philo_type].attrib['next'] = ' '.join(philo_id)
                if philo_type in philo_types:
                    print(record_dict[philo_type], file=output_file)
                else:
                    # Untracked types are emitted without prev/next links.
                    del record_dict[philo_type].attrib['next']
                    del record_dict[philo_type].attrib['prev']
                    print(record_dict[philo_type], file=output_file)
                record.attrib['prev'] = ' '.join(
                    record_dict[philo_type].id)
                record_dict[philo_type] = record
            else:
                # First object of its type has no predecessor.
                record.attrib['prev'] = ''
                record_dict[philo_type] = record
    philo_types.reverse()  # NOTE(review): mutates the closed-over list in place
    for obj in philo_types:
        try:
            # Flush the last pending record of each type with empty 'next'.
            record_dict[obj].attrib['next'] = ''
            print(record_dict[obj], file=output_file)
        except KeyError:
            # This type never occurred in the text.
            pass
    output_file.close()
    os.remove(text['sortedtoms'])
    philo_type_pattern = "|".join("^%s" % t for t in loader_obj.types)
    tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (
        temp_file, philo_type_pattern, loader_obj.sort_by_id,
        text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
def make_word_counts(loader_obj, text, depth=5):
    """Attach a word_count attribute to each closing object record.

    `depth` controls how many object levels (doc..para by default) carry a
    running word counter. text['raw'] is rewritten in place.
    """
    levels = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word']
    running = [0] * depth
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    with open(text['raw']) as filehandle:
        for line in filehandle:
            philo_type, word, philo_id, attrib = line.split('\t')
            record = Record(philo_type, word, philo_id.split())
            record.attrib = loads(attrib)
            if philo_type == 'word':
                # Every word increments the counter at every tracked level.
                running = [n + 1 for n in running]
            else:
                for level, count in enumerate(running):
                    if philo_type == levels[level]:
                        # Closing record for this level: store and reset.
                        record.attrib['word_count'] = count
                        running[level] = 0
            print(record, file=output_file)
    output_file.close()
    os.remove(text['raw'])
    os.rename(temp_file, text['raw'])
def inner_make_object_ancestors(loader_obj, text):
    """Add a <philo_type>_ancestor attribute (zero-padded id) per object type.

    Free variables: `philo_types` and `philo_type_depth`. Rewrites
    text["words"] in place.

    Bug fix: the original overwrote `philo_id` inside the inner loop, so
    every ancestor after the first was sliced from an already padded id
    rather than the record's real id.
    """
    temp_file = text["words"] + ".tmp"
    output_file = open(temp_file, "w")
    with open(text["words"]) as filehandle:
        for line in filehandle:
            philo_type, word, philo_id, attrib = line.split("\t")
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            for ancestor_type in philo_types:
                depth = philo_type_depth[ancestor_type]
                # Pad out to the full 7 fields, always slicing from the
                # record's original id.
                ancestor_id = philo_id[:depth] + ["0"] * (7 - depth)
                record.attrib[ancestor_type + "_ancestor"] = " ".join(ancestor_id)
            print(record, file=output_file)
    output_file.close()
    os.remove(text["words"])
    os.rename(temp_file, text["words"])
def smash_these_unicode_columns(loader_obj, text):
    """Store lowercased, diacritic-stripped copies of `columns` as <col>_norm.

    `columns` is a free variable from the enclosing scope. Rewrites
    text["sortedtoms"] in place.
    """
    source = text["sortedtoms"]
    out_fh = open(source + ".tmp", "w")
    for line in open(source):
        philo_type, word, philo_id, attrib = line.split('\t')
        record = Record(philo_type, word, philo_id.split())
        record.attrib = loads(attrib)
        for column in columns:
            if column not in record.attrib:
                continue
            # NFKD separates base characters from their combining accents,
            # which are then filtered out.
            folded = unicodedata.normalize("NFKD", record.attrib[column].lower())
            record.attrib[column + "_norm"] = ''.join(
                ch for ch in folded if not unicodedata.combining(ch))
        print(record, file=out_fh)
    out_fh.close()
    os.remove(source)
    os.rename(source + ".tmp", source)
def inner_make_object_ancestors(loader_obj, text):
    """Add a <philo_type>_ancestor attribute (zero-padded id) per object type.

    Free variables: `philo_types` and `philo_type_depth`. Rewrites
    text['words'] in place.

    Bug fix: the original reassigned `philo_id` in the inner loop, so each
    subsequent ancestor was computed from the previous iteration's padded
    id instead of the record's real id.
    """
    temp_file = text['words'] + '.tmp'
    output_file = open(temp_file, 'w')
    with open(text['words']) as filehandle:
        for line in filehandle:
            philo_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            for ancestor_type in philo_types:
                depth = philo_type_depth[ancestor_type]
                # Pad out to the full 7 fields, slicing from the original id.
                ancestor_id = philo_id[:depth] + ['0'] * (7 - depth)
                record.attrib[ancestor_type + '_ancestor'] = ' '.join(ancestor_id)
            print(record, file=output_file)
    output_file.close()
    os.remove(text['words'])
    os.rename(temp_file, text['words'])
def inner_prev_next_obj(loader_obj, text):
    """Store the previous and next object for every object passed to this function

    By default, this is doc, div1, div2, div3.

    `types` is a free variable from the enclosing scope listing the tracked
    object types. Output is filtered to loader_obj.types and re-sorted back
    into text["sortedtoms"].
    """
    record_dict = {}
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    with open(text['sortedtoms']) as fh:
        for line in fh:
            type, word, id, attrib = line.split('\t')
            id = id.split()
            record = Record(type, word, id)
            record.attrib = loads(attrib)
            if type in record_dict:
                # A new object of this type completes the pending one: set
                # its 'next' pointer and flush it.
                record_dict[type].attrib['next'] = ' '.join(id)
                if type in types:
                    print(record_dict[type], file=output_file)
                else:
                    # Untracked types are emitted without prev/next links.
                    del record_dict[type].attrib['next']
                    del record_dict[type].attrib['prev']
                    print(record_dict[type], file=output_file)
                record.attrib['prev'] = ' '.join(record_dict[type].id)
                record_dict[type] = record
            else:
                # First object of its type has no predecessor.
                record.attrib['prev'] = ''
                record_dict[type] = record
    types.reverse()  # NOTE(review): mutates the closed-over list in place
    for obj in types:
        try:
            # Flush the last pending record of each type with empty 'next'.
            record_dict[obj].attrib['next'] = ''
            print(record_dict[obj], file=output_file)
        except KeyError:
            # This type never occurred in the text.
            pass
    output_file.close()
    os.remove(text['sortedtoms'])
    type_pattern = "|".join("^%s" % t for t in loader_obj.types)
    tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (temp_file, type_pattern, loader_obj.sort_by_id, text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
def word_frequencies_per_obj(loader_obj, text, depth=1):
    """Write per-object-type .sorted files of word byte offsets under WORK/.

    For each object type up to `depth`, groups word records by the object's
    philo id prefix and accumulates each distinct word's byte offsets as a
    space-separated string. Relies on text['words'] ordering. (Python 2 code.)
    """
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    files_path = loader_obj.destination + '/WORK/'
    try:
        os.mkdir(files_path)
    except OSError:
        ## Path was already created
        pass
    for d, obj in enumerate(object_types):
        file = text['name'] + '.%s.sorted' % obj
        output = open(files_path + file, 'w')
        d = d + 1  # philo id prefix length for this object type
        old_philo_id = []
        records = {}
        for line in open(text['words']):
            type, word, id, attrib = line.split('\t')
            attrib = eval(attrib)
            philo_id = id.split()
            record = Record(type, word, philo_id)
            count_key = obj + '_token_count'
            byte = attrib['byte_start']
            del attrib['byte_start']
            record.attrib = {count_key: attrib[count_key]}
            # Crossing an object boundary: flush the words accumulated for
            # the previous object.
            if philo_id[:d] != old_philo_id[:d]:
                if records:
                    for w in records:
                        print >> output, records[w]
                    records = {}
            if word not in records:
                record.attrib['bytes'] = []
                record.attrib['bytes'] = str(byte)
                records[word] = record
            else:
                records[word].attrib['bytes'] += ' ' + str(byte)
            old_philo_id = philo_id
        # Flush the final object's words.
        for w in records:
            print >> output, records[w]
        output.close()
def make_token_counts(loader_obj, text, depth=5):
    """Batch consecutive identical words and pass each run to count_tokens.

    Relies on text['words'] being sorted by word so equal words are
    adjacent. (Python 2 code.)
    NOTE(review): output_file is never closed before the rename — confirm
    count_tokens flushes, or buffered output could be lost.
    """
    old_word = None
    record_list = []
    temp_file = text['words'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['words']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if word == old_word or old_word == None:
            record_list.append(record)
        else:
            # Word changed: emit counts for the finished run.
            count_tokens(record_list, depth, output_file)
            record_list = []
            record_list.append(record)
        old_word = word
    if len(record_list) != 0:
        # Emit counts for the final run.
        count_tokens(record_list, depth, output_file)
        record_list = []
    os.remove(text['words'])
    os.rename(temp_file, text['words'])
def inner_word_frequencies_per_obj(loader_obj,text):
    """Write per-object-type .freq_counts files of word byte offsets.

    `obj_types` is a free variable mapping object type name -> philo id
    prefix length. For each (object, word) pair, accumulates the word's
    byte offsets and its <obj>_token_count (stored as 'token_count').
    Relies on text['words'] ordering. (Python 2 code: iteritems.)
    """
    files_path = loader_obj.destination + '/WORK/'
    try:
        os.mkdir(files_path)
    except OSError:
        ## Path was already created
        pass
    for obj, d in obj_types.iteritems():
        file = text['name'] + '.%s.freq_counts' % obj
        output = open(files_path + file, 'w')
        old_philo_id = []
        old_word = ''
        records = {}
        for line in open(text['words']):
            type, word, id, attrib = line.split('\t')
            attrib = eval(attrib)
            ## Dismiss all irrelevant fields while making sure we still have 9 fields in the end
            philo_id = id.split()[:d] + [0 for i in range(7-d)] + [0,0]
            record = Record(type, word, philo_id)
            count_key = obj + '_token_count'
            byte = attrib['byte_start']
            del attrib['byte_start']
            record.attrib = {'token_count': attrib[count_key]}
            # New object or new word: flush what was accumulated so far.
            if philo_id[:d] != old_philo_id[:d] or word != old_word:
                if records and old_word:
                    for w in records:
                        print >> output, records[w]
                    records = {}
            if word not in records:
                record.attrib['bytes'] = []
                record.attrib['bytes']= str(byte)
                records[word] = record
            else:
                records[word].attrib['bytes'] += ' ' + str(byte)
            old_philo_id = philo_id
            old_word = word
        # Flush the final accumulated batch.
        for w in records:
            print >> output, records[w]
        output.close()
def prev_next_obj(loader_obj, text, depth=4):
    """Link each object to its previous/next sibling of the same type.

    Threads "prev"/"next" attributes through text["sortedtoms"], then
    re-filters to loader_obj.types and re-sorts the file in place.
    (Python 2 code.)

    Fix: guard the final flush loop against object types that never occur
    in this text (KeyError), matching inner_prev_next_obj.
    """
    object_types = ["doc", "div1", "div2", "div3", "para", "sent", "word"][:depth]
    record_dict = {}
    temp_file = text["raw"] + ".tmp"
    output_file = open(temp_file, "w")
    for line in open(text["sortedtoms"]):
        type, word, id, attrib = line.split("\t")
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if type in record_dict:
            # New object of this type: finish the pending one and flush it.
            record_dict[type].attrib["next"] = " ".join(id)
            if type in object_types:
                print >> output_file, record_dict[type]
            else:
                # Types beyond `depth` are emitted without prev/next links.
                del record_dict[type].attrib["next"]
                del record_dict[type].attrib["prev"]
                print >> output_file, record_dict[type]
            record.attrib["prev"] = " ".join(record_dict[type].id)
            record_dict[type] = record
        else:
            record.attrib["prev"] = ""
            record_dict[type] = record
    object_types.reverse()
    for obj in object_types:
        try:
            record_dict[obj].attrib["next"] = ""
            print >> output_file, record_dict[obj]
        except KeyError:
            # This object type never occurred in the text.
            pass
    output_file.close()
    os.remove(text["sortedtoms"])
    type_pattern = "|".join("^%s" % t for t in loader_obj.types)
    tomscommand = 'cat %s | egrep "%s" | sort %s > %s' % (
        temp_file,
        type_pattern,
        loader_obj.sort_by_id,
        text["sortedtoms"],
    )
    os.system(tomscommand)
    os.remove(temp_file)
def word_frequencies_per_obj(loader_obj, text, depth=1):
    """Write per-object-type .sorted files of word byte offsets under WORK/.

    Duplicate of the variant above: for each object type up to `depth`,
    groups word records by the object's philo id prefix and accumulates
    each distinct word's byte offsets. (Python 2 code.)
    """
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    files_path = loader_obj.destination + '/WORK/'
    try:
        os.mkdir(files_path)
    except OSError:
        ## Path was already created
        pass
    for d, obj in enumerate(object_types):
        file = text['name'] + '.%s.sorted' % obj
        output = open(files_path + file, 'w')
        d = d + 1  # philo id prefix length for this object type
        old_philo_id = []
        records = {}
        for line in open(text['words']):
            type, word, id, attrib = line.split('\t')
            attrib = eval(attrib)
            philo_id = id.split()
            record = Record(type, word, philo_id)
            count_key = obj + '_token_count'
            byte = attrib['byte_start']
            del attrib['byte_start']
            record.attrib = {count_key: attrib[count_key]}
            # Crossing an object boundary: flush the previous object's words.
            if philo_id[:d] != old_philo_id[:d]:
                if records:
                    for w in records:
                        print >> output, records[w]
                    records = {}
            if word not in records:
                record.attrib['bytes'] = []
                record.attrib['bytes']= str(byte)
                records[word] = record
            else:
                records[word].attrib['bytes'] += ' ' + str(byte)
            old_philo_id = philo_id
        # Flush the final object's words.
        for w in records:
            print >> output, records[w]
        output.close()
def smash_these_unicode_columns(loader_obj, text):
    """Add lowercased, diacritic-stripped <column>_norm attributes.

    `columns` is a free variable from the enclosing scope. (Python 2 code:
    values are decoded from UTF-8, folded with NFKD, re-encoded.)
    Rewrites text["sortedtoms"] in place.
    """
    tmp_file = open(text["sortedtoms"] + ".tmp", "w")
    for line in open(text["sortedtoms"]):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for column in columns:
            if column in record.attrib:
                col = record.attrib[column].decode("utf-8")
                col = col.lower()
                # NFKD separates base characters from combining accents so
                # the accents can be dropped.
                smashed_col = [
                    c for c in unicodedata.normalize("NFKD", col)
                    if not unicodedata.combining(c)
                ]
                record.attrib[column + "_norm"] = ''.join(
                    smashed_col).encode("utf-8")
        print >> tmp_file, record
    tmp_file.close()
    os.remove(text["sortedtoms"])
    os.rename(text["sortedtoms"] + ".tmp", text["sortedtoms"])
def inner_prev_next_obj(loader_obj, text):
    """Thread prev/next links through the objects in text['sortedtoms'].

    Duplicate of the variant above. `types` is a free variable from the
    enclosing scope listing the tracked object types. (Python 2 code.)
    """
    record_dict = {}
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['sortedtoms']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if type in record_dict:
            # A new object of this type completes the pending one.
            record_dict[type].attrib['next'] = ' '.join(id)
            if type in types:
                print >> output_file, record_dict[type]
            else:
                # Untracked types are written without prev/next attributes.
                del record_dict[type].attrib['next']
                del record_dict[type].attrib['prev']
                print >> output_file, record_dict[type]
            record.attrib['prev'] = ' '.join(record_dict[type].id)
            record_dict[type] = record
        else:
            # First object of its type has no predecessor.
            record.attrib['prev'] = ''
            record_dict[type] = record
    types.reverse()  # NOTE(review): mutates the closed-over list in place
    for obj in types:
        try:
            # Flush the final pending record of each type with empty 'next'.
            record_dict[obj].attrib['next'] = ''
            print >> output_file, record_dict[obj]
        except KeyError:
            # This type never occurred in the text.
            pass
    output_file.close()
    os.remove(text['sortedtoms'])
    type_pattern = "|".join("^%s" % t for t in loader_obj.types)
    tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (
        temp_file, type_pattern, loader_obj.sort_by_id, text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
def return_record(line):
    """Parse a raw toms line into (rec_type, unicode word, Record).

    (Python 2 code: the word is returned decoded from UTF-8.)
    """
    rec_type, word, raw_id, raw_attrib = line.split('\t')
    record = Record(rec_type, word, raw_id.split())
    record.attrib = eval(raw_attrib)
    return rec_type, word.decode('utf-8'), record
def tag_words(loader_obj, text):
    """Run TreeTagger over word tokens and merge pos/lemma into text["raw"].

    First pass streams every lowercased word token to a TreeTagger
    subprocess (restarted every `maxlines` lines to avoid garbage output);
    second pass re-reads text["raw"] and attaches the tagger's pos/lemma
    to each word record in lockstep. Free variables from the enclosing
    scope: tt_path, param_file, maxlines.

    Fix: open the Popen pipes in text mode (universal_newlines=True) — the
    original wrote str via print() to a default binary stdin pipe, which
    raises TypeError on Python 3.
    """
    # Set up the treetagger process.
    tt_args = [
        tt_path, "-token", "-lemma", "-prob", '-no-unknown', "-threshold",
        ".01", param_file
    ]
    ttout_filehandle = open(text["raw"] + ".ttout", "w")
    tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_filehandle,
                      universal_newlines=True)
    raw_filehandle = open(text["raw"], "r")
    line_count = 0
    # Read through the object file, pass the words to treetagger.
    for line in raw_filehandle:
        philo_type, word, philo_id, attrib = line.split('\t')
        philo_id = philo_id.split()
        if philo_type == "word":
            word = word.lower()
            # Close and re-open the treetagger process to prevent garbage
            # output.
            if line_count > maxlines:
                tt_worker.stdin.close()
                tt_worker.wait()
                new_ttout_filehandle = open(text["raw"] + ".ttout", "a")
                tt_worker = Popen(tt_args, stdin=PIPE,
                                  stdout=new_ttout_filehandle,
                                  universal_newlines=True)
                line_count = 0
            print(word, file=tt_worker.stdin)
            line_count += 1
    # Finish tagging.
    tt_worker.stdin.close()
    tt_worker.wait()
    # Go back through the object file and add the treetagger results to
    # each word.
    tmp_filehandle = open(text["raw"] + ".tmp", "w")
    tag_filehandle = open(text["raw"] + ".ttout", "r")
    for line in open(text["raw"], "r"):
        philo_type, word, philo_id, attrib = line.split('\t')
        philo_id = philo_id.split()
        record = Record(philo_type, word, philo_id)
        record.attrib = loads(attrib)
        if philo_type == "word":
            tag_l = tag_filehandle.readline()
            next_word, tag = tag_l.split("\t")[0:2]
            pos, lem, prob = tag.split(" ")
            # Abort the whole merge if tagger output falls out of lockstep
            # with the word stream.
            if next_word != word.lower():
                print("TREETAGGER ERROR:", next_word, " != ", word, pos, lem,
                      file=sys.stderr)
                return
            record.attrib["pos"] = pos
            record.attrib["lemma"] = lem
        print(record, file=tmp_filehandle)
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
    os.remove(text["raw"] + ".ttout")