Example No. 1
import codecs
import json
import os

# These examples rely on two module-level helpers that are not shown:
# get_files() and create_process(). A sketch of both follows this example.


def getting_data():
    files_names = get_files()
    assuntos = {}
    for file_name in files_names:
        print(file_name)
        with codecs.open(file_name, "r", "utf-8") as handle:
            text = handle.read()
        # Decode each JSON object into a process instance via the
        # create_process hook.
        x = json.loads(
            text, object_hook=lambda d: create_process(d.keys(), d.values()))
        for p in x:
            # Group documents by subject ("assunto"), tracking a count and
            # the matching abstracts and labels per subject.
            assunto = p.assunto.strip()
            if assunto in assuntos:
                assuntos[assunto]['valor'] += 1
                assuntos[assunto]['list_docs'].append(p.abstract)
                assuntos[assunto]['list_target'].append(assunto)
            else:
                assuntos[assunto] = {
                    'valor': 1,
                    'list_docs': [p.abstract],
                    'list_target': [assunto],
                }
    l_docs = []
    l_target = []
    i = 0
    cut = 500
    # Keep only subjects with more than `cut` documents, truncating each
    # class to `cut` items so the classes stay balanced.
    for k, v in sorted(assuntos.items(), key=lambda x: x[0], reverse=True):
        if v['valor'] > cut:
            i += 1
            l_docs += v['list_docs'][:cut]
            l_target += v['list_target'][:cut]
    print(i)
    print(len(l_docs), len(l_target))
    return l_docs, l_target
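
All of these examples lean on two helpers that are never shown: get_files(),
which lists the JSON files to read, and create_process(), which turns each
decoded JSON object into a process instance with attributes such as assunto
and abstract. A minimal sketch of what they might look like; only the call
signatures come from the examples, the bodies below are assumptions:

import os

def get_files():
    # Assumed helper: list the .json files next to this module.
    dir_ = os.path.dirname(os.path.abspath(__file__))
    return [os.path.join(dir_, f) for f in os.listdir(dir_)
            if f.endswith(".json")]

class Process:
    # Assumed helper: an attribute bag built from one JSON object, so that
    # fields such as p.assunto and p.abstract work as plain attributes.
    def __init__(self, keys, values):
        self.__dict__.update(zip(keys, values))

def create_process(keys, values):
    return Process(keys, values)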
Example No. 2
def get_processes(file_name):
    # Read a single JSON file (resolved relative to this module) and return
    # its decoded process instances.
    dir_ = os.path.dirname(os.path.abspath(__file__))
    processes = []
    print(file_name)
    with codecs.open(os.path.join(dir_, file_name), "r", "utf-8") as handle:
        text = handle.read()
        x = json.loads(
            text, object_hook=lambda d: create_process(d.keys(), d.values()))
        for p in x:
            processes.append(p)
    return processes
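
Usage is straightforward once the helpers above are in place (the file name
here is hypothetical):

processes = get_processes("processos.json")
print(len(processes), "processes loaded")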
Example No. 3
def count_data():
    # Count the total number of process records across all JSON files.
    dir_ = os.path.dirname(os.path.abspath(__file__))
    files_names = get_files()
    total = 0
    for file_name in files_names:
        print(file_name)
        with codecs.open(os.path.join(dir_, file_name), "r",
                         "utf-8") as handle:
            text = handle.read()
        x = json.loads(
            text, object_hook=lambda d: create_process(d.keys(), d.values()))
        total = total + len(x)
    return total
Example No. 4
def getting_data_all(cut=300, attr='assunto', del_keys=True):
    files_names = get_files()
    agrouped = {}
    for file_name in files_names:
        print(file_name)
        with codecs.open(file_name, "r", "utf-8") as handle:
            try:
                processes = json.loads(
                    handle.read(),
                    object_hook=lambda d: create_process(d.keys(), d.values()))
            except Exception:
                print('file cannot be read:', file_name)
                continue
            # Group processes by the chosen attribute (subject by default).
            for p in processes:
                if hasattr(p, attr):
                    group = getattr(p, attr)
                    if group in agrouped:
                        agrouped[group]['valor'] += 1
                        agrouped[group]['list_docs'].append(p.abstract)
                        agrouped[group]['list_target'].append(group)
                    else:
                        agrouped[group] = {
                            'valor': 1,
                            'list_docs': [p.abstract],
                            'list_target': [group],
                        }
    l_class = []
    keys_to_delete = []
    # Keep only well-formed groups with at least `cut` documents; mark the
    # rest (and suspiciously short keys) for deletion.
    for k, v in agrouped.items():
        if 'valor' not in v:
            print("no 'valor' entry:", k)
            keys_to_delete.append(k)
        elif len(k) < 3:
            keys_to_delete.append(k)
        elif v['valor'] >= cut:
            l_class.append(k)
        else:
            keys_to_delete.append(k)

    if del_keys:
        for key in keys_to_delete:
            del agrouped[key]
    return l_class, agrouped
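
A sketch of how the return values might feed a classifier: flatten the
surviving groups into parallel document/label lists, mirroring what
getting_data does above (truncating each class to cut items is an
assumption, borrowed from that example):

l_class, agrouped = getting_data_all(cut=300)
l_docs, l_target = [], []
for group in l_class:
    # Truncate each class so the training set stays balanced.
    l_docs += agrouped[group]['list_docs'][:300]
    l_target += agrouped[group]['list_target'][:300]
print(len(l_class), len(l_docs), len(l_target))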
Example No. 5
def getting_data_subject(attr='assunto'):
    files_names = get_files()
    agrouped = {}
    for file_name in files_names:
        print(file_name)
        with codecs.open(file_name, "r", "utf-8") as handle:
            try:
                processes = json.loads(
                    handle.read(),
                    object_hook=lambda d: create_process(d.keys(), d.values()))
            except Exception:
                print('file cannot be read:', file_name)
                continue
            for p in processes:
                if hasattr(p, attr):
                    group = getattr(p, attr)
                    if group in agrouped:
                        agrouped[group]['valor'] += 1
                        agrouped[group]['list_docs'].append(p.abstract)
                        agrouped[group]['list_target'].append(group)
                    else:
                        agrouped[group] = {
                            'valor': 1,
                            'list_docs': [p.abstract],
                            'list_target': [group],
                        }
    # Collect malformed entries; a set avoids deleting the same key twice
    # when both checks flag it, and the short-circuit handles v being None.
    keys_to_delete = set()
    for k, v in agrouped.items():
        if v is None or 'valor' not in v:
            print(k)
            keys_to_delete.add(k)
    for key in keys_to_delete:
        del agrouped[key]
    return agrouped
Example No. 6
def jdefault(o):
    # default hook for json.dumps: serialize objects via their attribute dict.
    return o.__dict__
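
The hook plugs into the standard json API through the default= parameter,
which json.dumps calls for any object it cannot serialize on its own. A
minimal usage sketch, reusing the hypothetical create_process from the
sketch after Example No. 1:

p = create_process(["assunto", "abstract"], ["Cheque", "..."])
print(json.dumps(p, default=jdefault))
# {"assunto": "Cheque", "abstract": "..."}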


# WordCloud comes from the wordcloud package (pip install wordcloud).
from wordcloud import WordCloud

if __name__ == "__main__":
    dir_ = os.path.dirname(os.path.abspath(__file__))
    files_names = [f for f in os.listdir(dir_) if f.endswith(".json")]
    l_docs = []
    l_files = []
    for file_name in files_names:
        with codecs.open(os.path.join(dir_, file_name), "r",
                         "utf-8") as handle:
            text = handle.read()
        x = json.loads(
            text, object_hook=lambda d: create_process(d.keys(), d.values()))
        for p in x:
            # Note the leading space: the subject comes unstripped from the
            # raw data.
            if p.assunto == " Cheque":
                l_docs.append(p.abstract)
                l_files.append(p.npu_process)

    print(len(l_docs), len(l_files))
    # Build a term-document matrix, turn per-term counts into relative
    # frequencies, and render them as a word cloud.
    result = tdm(l_docs, l_files)
    print(result.shape)
    word_count = result.sum(1)       # total count per term
    total_words = word_count.sum(0)  # grand total over all terms
    word_count2 = word_count / total_words
    wc = WordCloud(background_color="white", max_words=2000)
    wc.generate_from_frequencies(word_count2.to_dict())
    wc.to_file(os.path.join(dir_, "teste.jpg"))
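
tdm() is another helper that is not shown. Since its result supports .shape
and .sum(1), and the per-term sums convert via .to_dict() into the
word-to-frequency mapping WordCloud expects, it plausibly returns a pandas
term-document matrix with terms as rows and documents as columns. A sketch
under that assumption, using scikit-learn's CountVectorizer:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def tdm(docs, doc_ids):
    # Assumed implementation: count terms per document, then transpose so
    # rows are terms and columns are documents.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(docs)  # documents x terms, sparse
    return pd.DataFrame(counts.T.toarray(),
                        index=vectorizer.get_feature_names_out(),
                        columns=doc_ids)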