Example #1
def prev_next_obj(loader_obj, text, depth=5):
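    """Add 'prev' and 'next' object-id attributes to every doc/div/para
    record in text['sortedtoms'], then filter and re-sort the output back
    into place."""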
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent',
                    'word'][:depth]
    record_dict = {}
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['sortedtoms']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if type in record_dict:
            record_dict[type].attrib['next'] = ' '.join(id)
            if type in object_types:
                print >> output_file, record_dict[type]
            else:
                del record_dict[type].attrib['next']
                del record_dict[type].attrib['prev']
                print >> output_file, record_dict[type]
            record.attrib['prev'] = ' '.join(record_dict[type].id)
            record_dict[type] = record
        else:
            record.attrib['prev'] = ''
            record_dict[type] = record
    object_types.reverse()
    for obj in object_types:
        record_dict[obj].attrib['next'] = ''
        print >> output_file, record_dict[obj]
    output_file.close()
    os.remove(text['sortedtoms'])
    tomscommand = "cat %s | egrep \"^doc|^div|^para\" | sort %s > %s" % (
        temp_file, loader_obj.sort_by_id, text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
Example #2
 def normalize_these_columns(loader_obj,text):
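     """Propagate metadata columns down the div hierarchy: div1 values reset,
     div2 values override, and div3 records inherit any missing column.
     `columns` is assumed to be supplied by an enclosing factory function."""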
     current_values = {}
     tmp_file = open(text["sortedtoms"] + ".tmp","w")
     for column in columns:
         current_values[column] = ""
     for line in open(text["sortedtoms"]):
         type, word, id, attrib = line.split('\t')
         id = id.split()
         record = Record(type, word, id)
         record.attrib = eval(attrib)
         if type == "div1":
             for column in columns:
                 if column in record.attrib:
                     current_values[column] = record.attrib[column]
                 else:
                     current_values[column] = ""
         elif type == "div2":
             for column in columns:
                 if column in record.attrib:
                     current_values[column] = record.attrib[column]
         elif type == "div3":
             for column in columns:
                 if column not in record.attrib:
                     record.attrib[column] = current_values[column]
         print >> tmp_file, record
     tmp_file.close()
     os.remove(text["sortedtoms"])
     os.rename(text["sortedtoms"] + ".tmp",text["sortedtoms"])
Example #3
def prev_next_obj(loader_obj, text, depth=5):
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    record_dict = {}
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['sortedtoms']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib) 
        if type in record_dict:
            record_dict[type].attrib['next'] = ' '.join(id)
            if type in object_types:
                print >> output_file, record_dict[type]
            else:
                del record_dict[type].attrib['next']
                del record_dict[type].attrib['prev']
                print >> output_file, record_dict[type]
            record.attrib['prev'] = ' '.join(record_dict[type].id)
            record_dict[type] = record
        else:
            record.attrib['prev'] = ''
            record_dict[type] = record
    object_types.reverse()
    for obj in object_types:
        record_dict[obj].attrib['next'] = ''
        print >> output_file, record_dict[obj]
    output_file.close()
    os.remove(text['sortedtoms'])
    tomscommand = "cat %s | egrep \"^doc|^div|^para\" | sort %s > %s" % (temp_file,loader_obj.sort_by_id,text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
Example #4
 def inner_prev_next_obj(loader_obj, text):
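     """Link each object record to its previous and next sibling; `types`
     is assumed to be a closure variable from an enclosing factory function."""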
     record_dict = {}
     temp_file = text['raw'] + '.tmp'
     output_file = open(temp_file, 'w')
     for line in open(text['sortedtoms']):
         type, word, id, attrib = line.split('\t')
         id = id.split()
         record = Record(type, word, id)
         record.attrib = eval(attrib) 
         if type in record_dict:
             record_dict[type].attrib['next'] = ' '.join(id)
             if type in types:
                 print >> output_file, record_dict[type]
             else:
                 del record_dict[type].attrib['next']
                 del record_dict[type].attrib['prev']
                 print >> output_file, record_dict[type]
             record.attrib['prev'] = ' '.join(record_dict[type].id)
             record_dict[type] = record
         else:
             record.attrib['prev'] = ''
             record_dict[type] = record
     types.reverse()
     for obj in types:
         try:
             record_dict[obj].attrib['next'] = ''
             print >> output_file, record_dict[obj]
         except KeyError:
             pass
     output_file.close()
     os.remove(text['sortedtoms'])
     type_pattern = "|".join("^%s" % t for t in loader_obj.types)
     tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (temp_file,type_pattern,loader_obj.sort_by_id,text["sortedtoms"])
     os.system(tomscommand)
     os.remove(temp_file)
Example #5
def get_word_counts(_, text):
    """Lowercase and count words"""
    attrib_set = set()
    with open(text["raw"] + ".tmp", "w") as tmp_file:
        object_types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
        counts = [0 for i in range(5)]
        with open(text["raw"], encoding="utf8") as fh:
            for line in fh:
                philo_type, word, id, attrib = line.split("\t")
                id = id.split()
                record = Record(philo_type, word, id)
                record.attrib = loads(attrib)
                if philo_type == "word":
                    word = word.lower()
                for d, _ in enumerate(counts):
                    if philo_type == "word":
                        counts[d] += 1
                    elif philo_type == object_types[d]:
                        record.attrib["word_count"] = counts[d]
                        counts[d] = 0
                print(record, file=tmp_file)
                attrib_set.update(record.attrib.keys())
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
    return attrib_set
Example #6
 def normalize_these_columns(loader_obj, text):
     current_values = {}
     tmp_file = open(text["sortedtoms"] + ".tmp", "w")
     for column in columns:
         current_values[column] = ""
     for line in open(text["sortedtoms"]):
         type, word, id, attrib = line.split('\t')
         id = id.split()
         record = Record(type, word, id)
         record.attrib = loads(attrib)
         if type == "div1":
             for column in columns:
                 if column in record.attrib:
                     current_values[column] = record.attrib[column]
                 else:
                     current_values[column] = ""
         elif type == "div2":
             for column in columns:
                 if column in record.attrib:
                     current_values[column] = record.attrib[column]
         elif type == "div3":
             for column in columns:
                 if column not in record.attrib:
                     record.attrib[column] = current_values[column]
         print(record, file=tmp_file)
     tmp_file.close()
     os.remove(text["sortedtoms"])
     os.rename(text["sortedtoms"] + ".tmp", text["sortedtoms"])
Example #7
 def load_record(line):
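     """Parse one tab-delimited toms line into a Record with empty
     prev/next attributes."""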
     philo_type, word, philo_id, attrib = line.split('\t')
     philo_id = philo_id.split()
     record = Record(philo_type, word, philo_id)
     record.attrib = loads(attrib)
     record.attrib["prev"] = ""
     record.attrib["next"] = ""
     return record
Example #8
 def load_record(line):
     type, word, id, attrib = line.split('\t')
     id = id.split()
     record = Record(type, word, id)
     record.attrib = loads(attrib)
     record.attrib["prev"] = ""
     record.attrib["next"] = ""
     return record
Example #9
 def load_record(line):
     philo_type, word, philo_id, attrib = line.split('\t')
     philo_id = philo_id.split()
     record = Record(philo_type, word, philo_id)
     record.attrib = loads(attrib)
     record.attrib["prev"] = ""
     record.attrib["next"] = ""
     return record
Example #10
 def load_record(line):
     type, word, id, attrib = line.split('\t')
     id = id.split()
     record = Record(type, word, id)
     record.attrib = loads(attrib)
     record.attrib["prev"] = ""
     record.attrib["next"] = ""
     return record
Example #11
    def tag_words(loader_obj, text):
        # Set up the treetagger process
        tt_args = [
            tt_path, "-token", "-lemma", "-prob", '-no-unknown', "-threshold",
            ".01", param_file
        ]
        ttout_fh = open(text["raw"] + ".ttout", "w")
        tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_fh)
        raw_fh = open(text["raw"], "r")
        line_count = 0

        # read through the object file, pass the words to treetagger
        for line in raw_fh:
            type, word, id, attrib = line.split('\t')
            id = id.split()
            if type == "word":
                word = word.decode('utf-8', 'ignore').lower().encode('utf-8')
                # close and re-open the treetagger process to prevent garbage output.
                if line_count > maxlines:
                    tt_worker.stdin.close()
                    tt_worker.wait()
                    new_ttout_fh = open(text["raw"] + ".ttout", "a")
                    tt_worker = Popen(tt_args, stdin=PIPE, stdout=new_ttout_fh)
                    line_count = 0
                print >> tt_worker.stdin, word
                line_count += 1

        # finish tagging
        tt_worker.stdin.close()
        tt_worker.wait()

        # go back through the object file, and add the treetagger results to each word
        tmp_fh = open(text["raw"] + ".tmp", "w")
        tag_fh = open(text["raw"] + ".ttout", "r")
        for line in open(text["raw"], "r"):
            type, word, id, attrib = line.split('\t')
            id = id.split()
            record = Record(type, word, id)
            record.attrib = eval(attrib)
            if type == "word":
                tag_l = tag_fh.readline()
                next_word, tag = tag_l.split("\t")[0:2]
                pos, lem, prob = tag.split(" ")
                if next_word != word.decode('utf-8',
                                            'ignore').lower().encode('utf-8'):
                    print >> sys.stderr, "TREETAGGER ERROR:", next_word, " != ", word, pos, lem
                    return
                else:
                    record.attrib["pos"] = pos
                    record.attrib["lemma"] = lem
                    print >> tmp_fh, record
            else:
                print >> tmp_fh, record
        os.remove(text["raw"])
        os.rename(text["raw"] + ".tmp", text["raw"])
        os.remove(text["raw"] + ".ttout")
Example #12
def fix_pages(loader_obj, text, depth=4):
    """Unfinished, do not use"""
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    current_page = 0
    temp_file = open(text["sortedtoms"] + ".tmp", "w")
    for line in open(text["sortedtoms"]):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib) 
Example #13
    def tag_words(loader_obj, text):
        # Set up the treetagger process
        tt_args = [tt_path, "-token", "-lemma", "-prob", '-no-unknown', "-threshold", ".01", param_file]
        ttout_fh = open(text["raw"] + ".ttout", "w")
        tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_fh)
        raw_fh = open(text["raw"], "r")
        line_count = 0

        # read through the object file, pass the words to treetagger
        for line in raw_fh:
            type, word, id, attrib = line.split('\t')
            id = id.split()
            if type == "word":
                word = word.decode('utf-8', 'ignore').lower().encode('utf-8')
                # close and re-open the treetagger process to prevent garbage
                # output.
                if line_count > maxlines:
                    tt_worker.stdin.close()
                    tt_worker.wait()
                    new_ttout_fh = open(text["raw"] + ".ttout", "a")
                    tt_worker = Popen(tt_args, stdin=PIPE, stdout=new_ttout_fh)
                    line_count = 0
                print >> tt_worker.stdin, word
                line_count += 1

        # finish tagging
        tt_worker.stdin.close()
        tt_worker.wait()

        # go back through the object file, and add the treetagger results to
        # each word
        tmp_fh = open(text["raw"] + ".tmp", "w")
        tag_fh = open(text["raw"] + ".ttout", "r")
        for line in open(text["raw"], "r"):
            type, word, id, attrib = line.split('\t')
            id = id.split()
            record = Record(type, word, id)
            record.attrib = loads(attrib)
            if type == "word":
                tag_l = tag_fh.readline()
                next_word, tag = tag_l.split("\t")[0:2]
                pos, lem, prob = tag.split(" ")
                if next_word != word.decode('utf-8', 'ignore').lower().encode('utf-8'):
                    print >> sys.stderr, "TREETAGGER ERROR:", next_word, " != ", word, pos, lem
                    return
                else:
                    record.attrib["pos"] = pos
                    record.attrib["lemma"] = lem
                    print >> tmp_fh, record
            else:
                print >> tmp_fh, record
        os.remove(text["raw"])
        os.rename(text["raw"] + ".tmp", text["raw"])
        os.remove(text["raw"] + ".ttout")
Example #14
def normalize_unicode_raw_words(loader_obj, text):
    tmp_file = open(text["raw"] + ".tmp","w")
    for line in open(text["raw"]):
        rec_type, word, id, attrib = line.split('\t')
        id = id.split()
        if rec_type == "word":
            word = word.decode("utf-8").lower().encode("utf-8")
        record = Record(rec_type, word, id)
        record.attrib = eval(attrib)
        print >> tmp_file, record
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp",text["raw"])
Example #15
def normalize_unicode_raw_words(loader_obj, text):
    tmp_file = open(text["raw"] + ".tmp", "w")
    for line in open(text["raw"]):
        rec_type, word, id, attrib = line.split('\t')
        id = id.split()
        if rec_type == "word":
            word = word.decode("utf-8").lower().encode("utf-8")
        record = Record(rec_type, word, id)
        record.attrib = eval(attrib)
        print >> tmp_file, record
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
Example #16
def normalize_unicode_raw_words(loader_obj, text):
    tmp_file = open(text["raw"] + ".tmp", "w")
    with open(text["raw"]) as filehandle:
        for line in filehandle:
            rec_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            if rec_type == "word":
                word = word.lower()
            record = Record(rec_type, word, philo_id)
            record.attrib = loads(attrib)
            print(record, file=tmp_file)
    tmp_file.close()
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
Example #17
def normalize_unicode_raw_words(loader_obj, text):
    tmp_file = open(text["raw"] + ".tmp", "w")
    with open(text["raw"]) as filehandle:
        for line in filehandle:
            rec_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            if rec_type == "word":
                word = word.lower()
            record = Record(rec_type, word, philo_id)
            record.attrib = loads(attrib)
            print(record, file=tmp_file)
    tmp_file.close()
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
Example #18
def normalize_unicode_raw_words(loader_obj, text):
    tmp_file = open(text["raw"] + ".tmp", "w")
    with open(text["raw"]) as fh:
        for line in fh:
            rec_type, word, id, attrib = line.split('\t')
            id = id.split()
            if rec_type == "word":
                word = word.decode("utf-8").lower().encode("utf-8")
            record = Record(rec_type, word, id)
            record.attrib = loads(attrib)
            print(record, file=tmp_file)
    tmp_file.close()
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
Example #19
def normalize_unicode_raw_words(loader_obj, text):
    tmp_file = open(text["raw"] + ".tmp", "w")
    with open(text["raw"]) as fh:
        for line in fh:
            rec_type, word, id, attrib = line.split('\t')
            id = id.split()
            if rec_type == "word":
                word = word.decode("utf-8").lower().encode("utf-8")
            record = Record(rec_type, word, id)
            record.attrib = loads(attrib)
            print(record, file=tmp_file)
    tmp_file.close()
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
Example #20
    def inner_prev_next_obj(loader_obj, text):
        """Store the previous and next object for every object passed to this function
        By default, this is doc, div1, div2, div3."""
        record_dict = {}
        temp_file = text["raw"] + ".tmp"
        output_file = open(temp_file, "w")
        attrib_set = set()
        with open(text["sortedtoms"]) as filehandle:
            for line in filehandle:
                philo_type, word, philo_id, attrib = line.split("\t")
                philo_id = philo_id.split()
                record = Record(philo_type, word, philo_id)
                record.attrib = loads(attrib)
                if philo_type in record_dict:
                    record_dict[philo_type].attrib["next"] = " ".join(philo_id)
                    if philo_type in philo_types:
                        print(record_dict[philo_type], file=output_file)
                    else:
                        del record_dict[philo_type].attrib["next"]
                        del record_dict[philo_type].attrib["prev"]
                        print(record_dict[philo_type], file=output_file)
                    record.attrib["prev"] = " ".join(
                        record_dict[philo_type].id)
                    record_dict[philo_type] = record
                else:
                    record.attrib["prev"] = ""
                    record_dict[philo_type] = record
                attrib_set.update(record.attrib.keys())

        philo_types.reverse()
        for obj in philo_types:
            try:
                record_dict[obj].attrib["next"] = ""
                print(record_dict[obj], file=output_file)
            except KeyError:
                pass
        output_file.close()
        os.remove(text["sortedtoms"])
        philo_type_pattern = "|".join("^%s" % t for t in loader_obj.types)
        tomscommand = 'cat %s | egrep "%s" | LANG=C sort %s > %s' % (
            temp_file,
            philo_type_pattern,
            loader_obj.sort_by_id,
            text["sortedtoms"],
        )
        os.system(tomscommand)
        os.remove(temp_file)
        return attrib_set
Example #21
 def inner_make_object_ancestors(loader_obj, text):
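     """Stamp every record with a zero-padded ancestor ID for each object
     type; `types` and `type_depth` are assumed to be closure variables
     from an enclosing factory function."""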
     temp_file = text['words'] + '.tmp'
     output_file = open(temp_file, 'w')
     for line in open(text['words']):
         type, word, id, attrib = line.split('\t')
         id = id.split()
         record = Record(type, word, id)
         record.attrib = eval(attrib)
         for type in types:
             zeros_to_add = ['0' for i in range(7 - type_depth[type])]
             philo_id = id[:type_depth[type]] + zeros_to_add
             record.attrib[type + '_ancestor'] = ' '.join(philo_id)
         print >> output_file, record
     output_file.close()
     os.remove(text['words'])
     os.rename(temp_file, text['words'])
Example #22
 def inner_make_object_ancestors(loader_obj, text):
     temp_file = text['words'] + '.tmp'
     output_file = open(temp_file, 'w')
     for line in open(text['words']):
         type, word, id, attrib = line.split('\t')
         id = id.split()
         record = Record(type, word, id)
         record.attrib = eval(attrib)
         for type in types:
             zeros_to_add = ['0' for i in range(7 - type_depth[type])]
             philo_id = id[:type_depth[type]] + zeros_to_add
             record.attrib[type + '_ancestor'] = ' '.join(philo_id)
         print >> output_file, record
     output_file.close()
     os.remove(text['words'])
     os.rename(temp_file, text['words'])
Example #23
 def inner_make_object_ancestors(loader_obj, text):
     temp_file = text['words'] + '.tmp'
     output_file = open(temp_file, 'w')
     with open(text['words']) as filehandle:
         for line in filehandle:
             philo_type, word, philo_id, attrib = line.split('\t')
             philo_id = philo_id.split()
             record = Record(philo_type, word, philo_id)
             record.attrib = loads(attrib)
             for philo_type in philo_types:
                 zeros_to_add = ['0' for i in range(7 - philo_type_depth[philo_type])]
                 philo_id = philo_id[:philo_type_depth[philo_type]] + zeros_to_add
                 record.attrib[philo_type + '_ancestor'] = ' '.join(philo_id)
             print(record, file=output_file)
     output_file.close()
     os.remove(text['words'])
     os.rename(temp_file, text['words'])
Example #24
 def smash_these_unicode_columns(loader_obj, text):
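     """Store an accent-stripped, lowercased copy of selected metadata
     columns (NFKD normalization minus combining marks) under a "_norm"
     suffix; `columns` is assumed to come from an enclosing factory."""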
     tmp_file = open(text["sortedtoms"] + ".tmp", "w")
     for line in open(text["sortedtoms"]):
         type, word, id, attrib = line.split('\t')
         id = id.split()
         record = Record(type, word, id)
         record.attrib = loads(attrib)
         for column in columns:
             if column in record.attrib:
                 #                    print >> sys.stderr, repr(record.attrib)
                 col = record.attrib[column].decode("utf-8")
                 col = col.lower()
                 smashed_col = [c for c in unicodedata.normalize("NFKD", col) if not unicodedata.combining(c)]
                 record.attrib[column + "_norm"] = ''.join(smashed_col).encode("utf-8")
         print(record, file=tmp_file)
     tmp_file.close()
     os.remove(text["sortedtoms"])
     os.rename(text["sortedtoms"] + ".tmp", text["sortedtoms"])
Example #25
def make_word_counts(loader_obj, text, depth=4):
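    """Count words between successive objects at each depth and store the
    running tally as each object's word_count attribute."""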
    object_types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
    counts = [0 for i in range(depth)]
    temp_file = text["raw"] + ".tmp"
    output_file = open(temp_file, "w")
    for line in open(text["raw"]):
        type, word, id, attrib = line.split("\t")
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for d, count in enumerate(counts):
            if type == "word":
                counts[d] += 1
            elif type == object_types[d]:
                record.attrib["word_count"] = counts[d]
                counts[d] = 0
        print >> output_file, record
    output_file.close()
    os.remove(text["raw"])
    os.rename(temp_file, text["raw"])
Example #26
def make_word_counts(loader_obj, text, depth=5):
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word']
    counts = [0 for i in range(depth)]
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['raw']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for d,count in enumerate(counts):
            if type == 'word':
                counts[d] += 1
            elif type == object_types[d]:
                record.attrib['word_count'] = counts[d]
                counts[d] = 0
        print >> output_file, record
    output_file.close()
    os.remove(text['raw'])
    os.rename(temp_file, text['raw'])
Example #27
def make_word_counts(loader_obj, text, depth=4):
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word']
    counts = [0 for i in range(depth)]
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['raw']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for d, count in enumerate(counts):
            if type == 'word':
                counts[d] += 1
            elif type == object_types[d]:
                record.attrib['word_count'] = counts[d]
                counts[d] = 0
        print >> output_file, record
    output_file.close()
    os.remove(text['raw'])
    os.rename(temp_file, text['raw'])
Example #28
 def inner_prev_next_obj(loader_obj, text):
     """Store the previous and next object for every object passed to this function
     By default, this is doc, div1, div2, div3."""
     record_dict = {}
     temp_file = text['raw'] + '.tmp'
     output_file = open(temp_file, 'w')
     with open(text['sortedtoms']) as filehandle:
         for line in filehandle:
             philo_type, word, philo_id, attrib = line.split('\t')
             philo_id = philo_id.split()
             record = Record(philo_type, word, philo_id)
             record.attrib = loads(attrib)
             if philo_type in record_dict:
                 record_dict[philo_type].attrib['next'] = ' '.join(philo_id)
                 if philo_type in philo_types:
                     print(record_dict[philo_type], file=output_file)
                 else:
                     del record_dict[philo_type].attrib['next']
                     del record_dict[philo_type].attrib['prev']
                     print(record_dict[philo_type], file=output_file)
                 record.attrib['prev'] = ' '.join(
                     record_dict[philo_type].id)
                 record_dict[philo_type] = record
             else:
                 record.attrib['prev'] = ''
                 record_dict[philo_type] = record
     philo_types.reverse()
     for obj in philo_types:
         try:
             record_dict[obj].attrib['next'] = ''
             print(record_dict[obj], file=output_file)
         except KeyError:
             pass
     output_file.close()
     os.remove(text['sortedtoms'])
     philo_type_pattern = "|".join("^%s" % t for t in loader_obj.types)
     tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (
         temp_file, philo_type_pattern, loader_obj.sort_by_id,
         text["sortedtoms"])
     os.system(tomscommand)
     os.remove(temp_file)
Example #29
def make_word_counts(loader_obj, text, depth=5):
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word']
    counts = [0 for i in range(depth)]
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    with open(text['raw']) as filehandle:
        for line in filehandle:
            philo_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            for d, count in enumerate(counts):
                if philo_type == 'word':
                    counts[d] += 1
                elif philo_type == object_types[d]:
                    record.attrib['word_count'] = counts[d]
                    counts[d] = 0
            print(record, file=output_file)
    output_file.close()
    os.remove(text['raw'])
    os.rename(temp_file, text['raw'])
Example #30
 def inner_make_object_ancestors(loader_obj, text):
     temp_file = text["words"] + ".tmp"
     output_file = open(temp_file, "w")
     with open(text["words"]) as filehandle:
         for line in filehandle:
             philo_type, word, philo_id, attrib = line.split("\t")
             philo_id = philo_id.split()
             record = Record(philo_type, word, philo_id)
             record.attrib = loads(attrib)
             for philo_type in philo_types:
                 zeros_to_add = [
                     "0" for i in range(7 - philo_type_depth[philo_type])
                 ]
                 philo_id = philo_id[:philo_type_depth[
                     philo_type]] + zeros_to_add
                 record.attrib[philo_type +
                               "_ancestor"] = " ".join(philo_id)
             print(record, file=output_file)
     output_file.close()
     os.remove(text["words"])
     os.rename(temp_file, text["words"])
Example #31
 def smash_these_unicode_columns(loader_obj, text):
     tmp_file = open(text["sortedtoms"] + ".tmp", "w")
     for line in open(text["sortedtoms"]):
         philo_type, word, philo_id, attrib = line.split('\t')
         philo_id = philo_id.split()
         record = Record(philo_type, word, philo_id)
         record.attrib = loads(attrib)
         for column in columns:
             if column in record.attrib:
                 #                    print >> sys.stderr, repr(record.attrib)
                 col = record.attrib[column]
                 col = col.lower()
                 smashed_col = [
                     c for c in unicodedata.normalize("NFKD", col)
                     if not unicodedata.combining(c)
                 ]
                 record.attrib[column + "_norm"] = ''.join(smashed_col)
         print(record, file=tmp_file)
     tmp_file.close()
     os.remove(text["sortedtoms"])
     os.rename(text["sortedtoms"] + ".tmp", text["sortedtoms"])
Example #32
 def inner_make_object_ancestors(loader_obj, text):
     temp_file = text['words'] + '.tmp'
     output_file = open(temp_file, 'w')
     with open(text['words']) as filehandle:
         for line in filehandle:
             philo_type, word, philo_id, attrib = line.split('\t')
             philo_id = philo_id.split()
             record = Record(philo_type, word, philo_id)
             record.attrib = loads(attrib)
             for philo_type in philo_types:
                 zeros_to_add = [
                     '0' for i in range(7 - philo_type_depth[philo_type])
                 ]
                 philo_id = philo_id[:philo_type_depth[
                     philo_type]] + zeros_to_add
                 record.attrib[philo_type +
                               '_ancestor'] = ' '.join(philo_id)
             print(record, file=output_file)
     output_file.close()
     os.remove(text['words'])
     os.rename(temp_file, text['words'])
Example #33
def make_word_counts(loader_obj, text, depth=5):
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word']
    counts = [0 for i in range(depth)]
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    with open(text['raw']) as filehandle:
        for line in filehandle:
            philo_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            for d, count in enumerate(counts):
                if philo_type == 'word':
                    counts[d] += 1
                elif philo_type == object_types[d]:
                    record.attrib['word_count'] = counts[d]
                    counts[d] = 0
            print(record, file=output_file)
    output_file.close()
    os.remove(text['raw'])
    os.rename(temp_file, text['raw'])
Example #34
 def inner_prev_next_obj(loader_obj, text):
     """Store the previous and next object for every object passed to this function
     By default, this is doc, div1, div2, div3."""
     record_dict = {}
     temp_file = text['raw'] + '.tmp'
     output_file = open(temp_file, 'w')
     with open(text['sortedtoms']) as fh:
         for line in fh:
             type, word, id, attrib = line.split('\t')
             id = id.split()
             record = Record(type, word, id)
             record.attrib = loads(attrib)
             if type in record_dict:
                 record_dict[type].attrib['next'] = ' '.join(id)
                 if type in types:
                     print(record_dict[type], file=output_file)
                 else:
                     del record_dict[type].attrib['next']
                     del record_dict[type].attrib['prev']
                     print(record_dict[type], file=output_file)
                 record.attrib['prev'] = ' '.join(record_dict[type].id)
                 record_dict[type] = record
             else:
                 record.attrib['prev'] = ''
                 record_dict[type] = record
     types.reverse()
     for obj in types:
         try:
             record_dict[obj].attrib['next'] = ''
             print(record_dict[obj], file=output_file)
         except KeyError:
             pass
     output_file.close()
     os.remove(text['sortedtoms'])
     type_pattern = "|".join("^%s" % t for t in loader_obj.types)
     tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (temp_file, type_pattern, loader_obj.sort_by_id,
                                                             text["sortedtoms"])
     os.system(tomscommand)
     os.remove(temp_file)
Example #35
def word_frequencies_per_obj(loader_obj, text, depth=1):
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent',
                    'word'][:depth]
    files_path = loader_obj.destination + '/WORK/'
    try:
        os.mkdir(files_path)
    except OSError:
        ## Path was already created
        pass
    for d, obj in enumerate(object_types):
        file = text['name'] + '.%s.sorted' % obj
        output = open(files_path + file, 'w')
        d = d + 1
        old_philo_id = []
        records = {}
        for line in open(text['words']):
            type, word, id, attrib = line.split('\t')
            attrib = eval(attrib)
            philo_id = id.split()
            record = Record(type, word, philo_id)
            count_key = obj + '_token_count'
            byte = attrib['byte_start']
            del attrib['byte_start']
            record.attrib = {count_key: attrib[count_key]}
            if philo_id[:d] != old_philo_id[:d]:
                if records:
                    for w in records:
                        print >> output, records[w]
                    records = {}
            if word not in records:
                record.attrib['bytes'] = str(byte)
                records[word] = record
            else:
                records[word].attrib['bytes'] += ' ' + str(byte)
            old_philo_id = philo_id
        for w in records:
            print >> output, records[w]
        output.close()
Example #36
def make_token_counts(loader_obj, text, depth=5):
    old_word = None
    record_list = []
    temp_file = text['words'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['words']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if word == old_word or old_word is None:
            record_list.append(record)
        else:
            count_tokens(record_list, depth, output_file)
            record_list = []
            record_list.append(record)
        old_word = word
    if len(record_list) != 0:
        count_tokens(record_list, depth, output_file)
    record_list = []
    os.remove(text['words'])
    os.rename(temp_file, text['words'])
Example #37
 def inner_word_frequencies_per_obj(loader_obj,text):
     files_path = loader_obj.destination + '/WORK/'
     try:
         os.mkdir(files_path)
     except OSError:
         ## Path was already created
         pass
     for obj, d in obj_types.iteritems():
         file = text['name'] + '.%s.freq_counts' % obj
         output = open(files_path + file, 'w')
         old_philo_id = []
         old_word = ''
         records = {}
         for line in open(text['words']):
             type, word, id, attrib = line.split('\t')
             attrib = eval(attrib)
             ## Dismiss all irrelevant fields while making sure we still have 9 fields in the end
             philo_id = id.split()[:d] + [0 for i in range(7-d)] + [0,0]
             record = Record(type, word, philo_id)
             count_key = obj + '_token_count'
             byte = attrib['byte_start']
             del attrib['byte_start']
             record.attrib = {'token_count': attrib[count_key]}
             if philo_id[:d] != old_philo_id[:d] or word != old_word:
                 if records and old_word:
                     for w in records:
                         print >> output, records[w]
                     records = {}
             if word not in records:
                 record.attrib['bytes'] = str(byte)
                 records[word] = record
             else:
                 records[word].attrib['bytes'] += ' ' + str(byte)
             old_philo_id = philo_id
             old_word = word
         for w in records:
             print >> output, records[w]
         output.close()
Example #38
def make_token_counts(loader_obj, text, depth=5):
    old_word = None
    record_list = []
    temp_file = text['words'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['words']):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if word == old_word or old_word is None:
            record_list.append(record)
        else:
            count_tokens(record_list, depth, output_file)
            record_list = []
            record_list.append(record)
        old_word = word
    if len(record_list) != 0:
        count_tokens(record_list, depth, output_file)
    record_list = []
    os.remove(text['words'])
    os.rename(temp_file, text['words'])
Example #39
def prev_next_obj(loader_obj, text, depth=4):
    object_types = ["doc", "div1", "div2", "div3", "para", "sent", "word"][:depth]
    record_dict = {}
    temp_file = text["raw"] + ".tmp"
    output_file = open(temp_file, "w")
    for line in open(text["sortedtoms"]):
        type, word, id, attrib = line.split("\t")
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        if type in record_dict:
            record_dict[type].attrib["next"] = " ".join(id)
            if type in object_types:
                print >> output_file, record_dict[type]
            else:
                del record_dict[type].attrib["next"]
                del record_dict[type].attrib["prev"]
                print >> output_file, record_dict[type]
            record.attrib["prev"] = " ".join(record_dict[type].id)
            record_dict[type] = record
        else:
            record.attrib["prev"] = ""
            record_dict[type] = record
    object_types.reverse()
    for obj in object_types:
        record_dict[obj].attrib["next"] = ""
        print >> output_file, record_dict[obj]
    output_file.close()
    os.remove(text["sortedtoms"])
    type_pattern = "|".join("^%s" % t for t in loader_obj.types)
    tomscommand = 'cat %s | egrep "%s" | sort %s > %s' % (
        temp_file,
        type_pattern,
        loader_obj.sort_by_id,
        text["sortedtoms"],
    )
    os.system(tomscommand)
    os.remove(temp_file)
Example #40
def word_frequencies_per_obj(loader_obj, text, depth=1):
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    files_path = loader_obj.destination + '/WORK/'
    try:
        os.mkdir(files_path)
    except OSError:
        ## Path was already created
        pass
    for d, obj in enumerate(object_types):
        file = text['name'] + '.%s.sorted' % obj
        output = open(files_path + file, 'w')
        d = d + 1
        old_philo_id = []
        records = {}
        for line in open(text['words']):
            type, word, id, attrib = line.split('\t')
            attrib = eval(attrib)
            philo_id = id.split()
            record = Record(type, word, philo_id)
            count_key = obj + '_token_count'
            byte = attrib['byte_start']
            del attrib['byte_start']
            record.attrib = {count_key: attrib[count_key]}
            if philo_id[:d] != old_philo_id[:d]:
                if records:
                    for w in records:
                        print >> output, records[w]
                    records = {}
            if word not in records:
                record.attrib['bytes'] = str(byte)
                records[word] = record
            else:
                records[word].attrib['bytes'] += ' ' + str(byte)
            old_philo_id = philo_id
        for w in records:
            print >> output, records[w]
        output.close()
Example #41
 def smash_these_unicode_columns(loader_obj, text):
     tmp_file = open(text["sortedtoms"] + ".tmp", "w")
     for line in open(text["sortedtoms"]):
         type, word, id, attrib = line.split('\t')
         id = id.split()
         record = Record(type, word, id)
         record.attrib = eval(attrib)
         for column in columns:
             if column in record.attrib:
                 #                    print >> sys.stderr, repr(record.attrib)
                 col = record.attrib[column].decode("utf-8")
                 col = col.lower()
                 smashed_col = [
                     c for c in unicodedata.normalize("NFKD", col)
                     if not unicodedata.combining(c)
                 ]
                 record.attrib[column + "_norm"] = ''.join(
                     smashed_col).encode("utf-8")
                 #record.attrib[column + "_norm"] = ''.join([c.encode("utf-8") for c in unicodedata.normalize('NFKD',record.attrib[column].decode("utf-8").lower()) if not unicodedata.combining(c)])
         print >> tmp_file, record
     tmp_file.close()
     os.remove(text["sortedtoms"])
     os.rename(text["sortedtoms"] + ".tmp", text["sortedtoms"])
Example #42
 def inner_prev_next_obj(loader_obj, text):
     record_dict = {}
     temp_file = text['raw'] + '.tmp'
     output_file = open(temp_file, 'w')
     for line in open(text['sortedtoms']):
         type, word, id, attrib = line.split('\t')
         id = id.split()
         record = Record(type, word, id)
         record.attrib = eval(attrib)
         if type in record_dict:
             record_dict[type].attrib['next'] = ' '.join(id)
             if type in types:
                 print >> output_file, record_dict[type]
             else:
                 del record_dict[type].attrib['next']
                 del record_dict[type].attrib['prev']
                 print >> output_file, record_dict[type]
             record.attrib['prev'] = ' '.join(record_dict[type].id)
             record_dict[type] = record
         else:
             record.attrib['prev'] = ''
             record_dict[type] = record
     types.reverse()
     for obj in types:
         try:
             record_dict[obj].attrib['next'] = ''
             print >> output_file, record_dict[obj]
         except KeyError:
             pass
     output_file.close()
     os.remove(text['sortedtoms'])
     type_pattern = "|".join("^%s" % t for t in loader_obj.types)
     tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (
         temp_file, type_pattern, loader_obj.sort_by_id, text["sortedtoms"])
     os.system(tomscommand)
     os.remove(temp_file)
Example #43
def return_record(line):
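    """Parse one toms line and return (rec_type, unicode word, Record)."""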
    rec_type, word, id, attrib = line.split('\t')
    id = id.split()
    record = Record(rec_type, word, id)
    record.attrib = eval(attrib)
    return rec_type, word.decode('utf-8'), record
Example #44
    def tag_words(loader_obj, text):
        # Set up the treetagger process
        tt_args = [
            tt_path, "-token", "-lemma", "-prob", '-no-unknown', "-threshold",
            ".01", param_file
        ]
        ttout_filehandle = open(text["raw"] + ".ttout", "w")
        tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_filehandle)
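        # Note: the print(..., file=tt_worker.stdin) calls below assume a
        # text-mode pipe; with Popen's default binary pipes this would need
        # text=True (an assumption about the surrounding setup).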
        raw_filehandle = open(text["raw"], "r")
        line_count = 0

        # read through the object file, pass the words to treetagger
        for line in raw_filehandle:
            philo_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            if philo_type == "word":
                word = word.lower()
                # close and re-open the treetagger process to prevent garbage
                # output.
                if line_count > maxlines:
                    tt_worker.stdin.close()
                    tt_worker.wait()
                    new_ttout_filehandle = open(text["raw"] + ".ttout", "a")
                    tt_worker = Popen(tt_args,
                                      stdin=PIPE,
                                      stdout=new_ttout_filehandle)
                    line_count = 0
                print(word, file=tt_worker.stdin)
                line_count += 1

        # finish tagging
        tt_worker.stdin.close()
        tt_worker.wait()

        # go back through the object file, and add the treetagger results to
        # each word
        tmp_filehandle = open(text["raw"] + ".tmp", "w")
        tag_filehandle = open(text["raw"] + ".ttout", "r")
        for line in open(text["raw"], "r"):
            philo_type, word, philo_id, attrib = line.split('\t')
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            if philo_type == "word":
                tag_l = tag_filehandle.readline()
                next_word, tag = tag_l.split("\t")[0:2]
                pos, lem, prob = tag.split(" ")
                if next_word != word.lower():
                    print("TREETAGGER ERROR:",
                          next_word,
                          " != ",
                          word,
                          pos,
                          lem,
                          file=sys.stderr)
                    return
                else:
                    record.attrib["pos"] = pos
                    record.attrib["lemma"] = lem
                    print(record, file=tmp_filehandle)
            else:
                print(record, file=tmp_filehandle)
        os.remove(text["raw"])
        os.rename(text["raw"] + ".tmp", text["raw"])
        os.remove(text["raw"] + ".ttout")
Example #45
def return_record(line):
    rec_type, word, id, attrib = line.split('\t')
    id = id.split()
    record = Record(rec_type, word, id)
    record.attrib = eval(attrib)
    return rec_type, word.decode('utf-8'), record