Esempio n. 1
0
def _first_misaligned_offset(article, entities):
    """Return the smallest non-zero start offset whose text disagrees with its entity.

    For every boundary pair of every non-empty entity, the two characters of
    ``article`` starting at ``pair[0]`` (lower-cased) are compared with the
    first two characters of ``entity['Entity']``.  Mismatches at offset 0 are
    ignored.  ``len(article)`` is returned when no qualifying mismatch exists.
    """
    first_mistake = len(article)
    for entity in entities:
        if len(entity) == 0:
            continue
        for pair in entity['Boundaries']:
            start = pair[0]
            # NOTE(review): only the article side is lower-cased; this
            # presumably relies on entity['Entity'] already being lower-case
            # -- confirm against whatever produces res.json.
            if article[start:start + 2].lower() != entity['Entity'][0:2]:
                if first_mistake > start and start != 0:
                    first_mistake = start
    return first_mistake


def processing(letter):
    """Write a report of articles under ``<pathCorpus>/<letter>`` with misaligned entities.

    For each sub-directory of ``paths.pathCorpus + '/' + letter`` the files
    ``article`` and ``res.json`` are loaded through ``marking.read_article``
    and ``marking.read_json``.  When at least one entity boundary does not
    line up with the article text, a line
    ``"<article name> <first bad offset> <article length>"`` is appended to
    ``<cwd>/<letter>_errors.txt``.  The report ends with three newlines
    followed by the number of articles that were reported.

    Articles that raise any exception while being read or checked are
    skipped silently (best effort).

    :param letter: name of the corpus sub-directory to scan; also used as the
        prefix of the report file name.
    """
    count = 0
    # ``with`` guarantees the report is flushed and closed even when listing
    # the directory (or a write) fails; previously the handle leaked.
    with open(os.getcwd() + '/' + letter + '_errors.txt', "w") as errors:
        articles = listdir(paths.pathCorpus + '/' + letter)
        # Parenthesised so the statement is valid on both Python 2 and 3.
        print(letter)
        for name_article in articles:
            article_dir = paths.pathCorpus + '/' + letter + '/' + name_article
            patharticle = article_dir + "/article"
            pathentites = article_dir + "/res.json"
            try:
                json_of_article = marking.read_json(pathentites)
                article = marking.read_article(patharticle)
                first_mistake = _first_misaligned_offset(article, json_of_article)
                if first_mistake != len(article):
                    line = name_article + " " + str(first_mistake) + " " + str(len(article)) + '\n'
                    errors.write(line)
                    count += 1
            except Exception:
                # Deliberate best effort: an unreadable or malformed article
                # must not abort the scan of the remaining ones.
                continue
        errors.write("\n\n\n" + str(count))
def checkArticle(article, path):
    """Validate an article and rewrite its ``res.json`` without stop-word entities.

    The article is rejected (``False``) when

    * ``article`` contains any of the characters ``* ? | < > / ^ :``,
    * fewer than 7 entities remain after dropping empty entries and entries
      whose ``"Entity"`` value is an English stop word, or
    * any exception is raised while reading, filtering or writing.

    Otherwise ``<path>/res.json`` is overwritten with the filtered entity
    list and ``True`` is returned.

    :param article: article name to check for forbidden characters.
    :param path: directory that contains the article's ``res.json``.
    :return: ``True`` if the article was accepted and its JSON rewritten,
        ``False`` otherwise.
    """
    try:
        # Raw string: same pattern as before, but "\|" is no longer an
        # invalid string escape sequence.
        if re.search(r"[*?\|<>/^:]", article) is not None:
            return False
        json_path = path + os.sep + "res.json"
        json_of_article = marking.read_json(json_path)
        # Fetch the stop-word list once instead of once per entity, and use a
        # set so each membership test is O(1).
        # (``stopwords`` is presumably nltk.corpus.stopwords -- its import is
        # not visible in this part of the file.)
        english_stopwords = set(stopwords.words("english"))
        filtered_words = [
            word for word in json_of_article
            if len(word) > 0 and word["Entity"] not in english_stopwords
        ]
        if len(filtered_words) < 7:
            return False
        # Serialise before opening the file: opening with "w" truncates it,
        # so a serialisation error must not be able to destroy the old data.
        serialized = json.dumps(filtered_words)
        with open(json_path, "w") as out_file:
            out_file.write(serialized)
            # Progress output (the original printed this twice by accident).
            print(out_file)
        return True
    except Exception:
        # Deliberate best effort: any failure simply marks the article as
        # unusable for the caller.
        return False
Esempio n. 3
0
import marking
import os
import sys
import argparse
from os import listdir

# Command-line interface: both locations default to folders inside the
# current working directory.
parser = argparse.ArgumentParser()
parser.add_argument('--pathCorpus', default=os.getcwd() + os.sep + 'Corpus')
parser.add_argument('--pathHTMLs', default=os.getcwd() + os.sep + 'NERCorpus')
paths = parser.parse_args(sys.argv[1:])

# Create the root output folder on first use.
if not os.path.exists(paths.pathHTMLs):
    os.mkdir(paths.pathHTMLs)

allfiles = listdir(paths.pathCorpus)
for q1 in allfiles:
    # q1 is one first-level entry of the corpus; the output tree mirrors it.
    corpus_subdir = paths.pathCorpus + os.sep + q1
    html_subdir = paths.pathHTMLs + os.sep + q1
    q2 = listdir(corpus_subdir)
    print(q1)
    if not os.path.exists(html_subdir):
        os.mkdir(html_subdir)
    for q3 in q2:
        # Each article folder is expected to hold an "article" file and a
        # "res.json" file.
        article_dir = corpus_subdir + os.sep + q3
        path = article_dir + os.sep + "article"
        pathout = article_dir + os.sep + "res.json"
        try:
            json_of_article = marking.read_json(pathout)
            article = marking.read_article(path)
            entities = marking.get_entities(json_of_article)
            # The output path handed to make_html is <pathHTMLs>/<q1>/<q3>.html.
            marking.make_html(article, entities, html_subdir + os.sep + q3 + ".html")
        except Exception:
            # Best effort: a failing article is skipped and the run goes on.
            pass