Example #1
0
def generate_csv(in_dir, out):
    """Walk the cable source `in_dir` and write a CSV summary to `out`.

    One ';'-separated row per cable: reference id, creation date,
    origin, and the title-cased subject. A header row is written first.
    """
    csv_out = UnicodeWriter(open(out, "wb"), delimiter=";")
    csv_out.writerow(("Reference ID", "Created", "Origin", "Subject"))
    for c in cables_from_source(in_dir):
        row = (c.reference_id, c.created, c.origin, titlefy(c.subject))
        csv_out.writerow(row)
def generate_csv(src, out):
    """Walk `src` and write German-origin cables to the CSV file `out`.

    Only cables accepted by the `pred.origin_germany` origin filter are
    included. Columns: reference id, created, origin, title-cased subject.
    """
    origin_pred = pred.origin_filter(pred.origin_germany)
    sink = UnicodeWriter(open(out, 'wb'), delimiter=';')
    sink.writerow(('Reference ID', 'Created', 'Origin', 'Subject'))
    for c in cables_from_source(src, predicate=origin_pred):
        sink.writerow((c.reference_id, c.created, c.origin,
                       titlefy(c.subject)))
Example #3
0
def generate_csv(in_dir, out):
    """Walk the cable directory `in_dir` and write a CSV index to `out`.

    Each cable yields one ';'-separated row: reference id, creation
    date, origin, and title-cased subject.
    """
    out_csv = UnicodeWriter(open(out, 'wb'), delimiter=';')
    header = ('Reference ID', 'Created', 'Origin', 'Subject')
    out_csv.writerow(header)
    for item in cables_from_source(in_dir):
        out_csv.writerow(
            (item.reference_id, item.created, item.origin,
             titlefy(item.subject)))
Example #4
0
def generate_text_files(in_dir, out_dir, include_header=False):
    """Walk the cable source `in_dir` and write one UTF-8 text file per
    cable into `out_dir`.

    Each file is named `<reference_id>.txt`. When `include_header` is
    true, the cable header is included in the rendered text.
    """
    for cable in cables_from_source(in_dir):
        path = out_dir + '/' + cable.reference_id + '.txt'
        # Use a context manager so the file is closed even when
        # rendering or writing raises (the original leaked the handle
        # on an exception).
        with codecs.open(path, 'wb', encoding='utf-8') as out:
            out.write(cable_to_text(cable, include_header))
Example #5
0
def run_update(src, predicate=None):
    """Scan every cable in `src` (optionally filtered by `predicate`)
    and collect acronyms, cables with missing subjects, and tags.

    Returns a dict with the keys 'acronyms', 'subjects' and 'tags'.
    """
    found_acronyms = set(_ACRONYMS)
    missing_subjects = set()
    tag_index = defaultdict(list)
    for c in cables_from_source(src, predicate):
        update_acronyms(c, found_acronyms)
        update_missing_subjects(c, missing_subjects)
        update_tags(c, tag_index)
    return {
        'acronyms': found_acronyms,
        'subjects': missing_subjects,
        'tags': tag_index,
    }
def generate_csv(src, out):
    """Walk `src` and write cables originating from Germany to the
    ';'-delimited CSV file `out`.

    Columns: reference id, created, origin, title-cased subject.
    """
    germany_only = pred.origin_filter(pred.origin_germany)
    writer = UnicodeWriter(open(out, 'wb'), delimiter=';')
    writer.writerow(('Reference ID', 'Created', 'Origin', 'Subject'))
    for entry in cables_from_source(src, predicate=germany_only):
        row = (entry.reference_id, entry.created, entry.origin,
               titlefy(entry.subject))
        writer.writerow(row)
Example #7
0
def generate_csv(path, out):
    """Walk the cable source `path` and write Berlin cables to the
    ';'-delimited CSV file `out`.

    Only cables whose filename contains the substring 'BERLIN' pass
    the predicate.
    """
    def mentions_berlin(name):
        return 'BERLIN' in name

    csv_out = UnicodeWriter(open(out, 'wb'), delimiter=';')
    csv_out.writerow(('Reference ID', 'Created', 'Origin', 'Subject'))
    for c in cables_from_source(path, predicate=mentions_berlin):
        csv_out.writerow(
            (c.reference_id, c.created, c.origin, titlefy(c.subject)))
Example #8
0
def generate_csv(path, out):
    """Walk the cable source `path` and write, for each cable whose
    body mentions at least one known person, a row to the CSV `out`.

    Row layout: reference id followed by every matched person name.
    The name list is loaded from `person_names.txt` next to this module.
    """
    names_path = os.path.join(os.path.dirname(__file__), 'person_names.txt')
    with codecs.open(names_path, 'rb', 'utf-8') as names_file:
        known_persons = {line.rstrip() for line in names_file}
    writer = UnicodeWriter(open(out, 'wb'), delimiter=';')
    for cable in cables_from_source(path):
        body = cable.content_body
        if not body:
            continue
        mentioned = [name for name in known_persons if name in body]
        if mentioned:
            writer.writerow([cable.reference_id] + mentioned)
Example #9
0
def cable_doc_gen():
    """Generator that iterates over the cables in `fname`, yielding one
    cable at a time wrapped in an Elasticsearch-compatible action dict.

    Stops after 1000 cables and prints a progress line every 1000
    items. (Docstring translated from Portuguese.)
    """
    for idx, cable in enumerate(cables_from_source(fname)):
        action = {
            "_index": "wikileaks",
            "_type": "telegramas",
            "_id": idx,
            "doc": build_doc(cable),
        }
        # The cut-off check runs before the yield, so exactly the first
        # 1000 actions (ids 0-999) are produced.
        if idx == 1000:
            break
        if idx % 1000 == 0:
            print("Indexando telegrama número {}".format(idx))

        yield action
Example #10
0
def generate_csv(path, out):
    """Walk the cable source `path` and emit a CSV row to `out` for
    every cable whose body mentions at least one known person.

    Each row holds the cable's reference id followed by the matched
    names, loaded from `person_names.txt` in this module's directory.
    """
    here = os.path.dirname(__file__)
    with codecs.open(os.path.join(here, 'person_names.txt'), 'rb',
                     'utf-8') as names:
        known_persons = set(line.rstrip() for line in names)
    csv_out = UnicodeWriter(open(out, 'wb'), delimiter=';')
    for cable in cables_from_source(path):
        text = cable.content_body
        if not text:
            continue
        found = [p for p in known_persons if p in text]
        if not found:
            continue
        csv_out.writerow([cable.reference_id] + found)
Example #11
0
def telegrams():
    """Build a DataFrame with one row per cable in the module-level
    source `fname`.

    Columns:
        index: running cable number.
        lista: the cable body lower-cased and reduced to lemmatized
               nouns (short tokens, punctuation, non-alphanumeric
               characters and stopwords stripped).
    """
    rows = []
    for j, cable in enumerate(cables_from_source(fname)):
        print("Gerando telegrama {}".format(j), end='\r')
        content = cable.content
        # Keep the text from the first numbered paragraph ("1. ") up to,
        # but not including, the final character.
        content = content[content.find("1. "):len(content) - 1].lower()
        content = strip_short(content, minsize=3)
        content = strip_punctuation(content)
        content = strip_non_alphanum(content)
        content = remove_stopwords(content)
        content = lemmatization(content, ['NOUN'])
        rows.append({'index': j, 'lista': content})
    # Build the frame once at the end: DataFrame.append was deprecated in
    # pandas 1.4 and removed in 2.0, and per-row appends are quadratic.
    return pd.DataFrame(rows, columns=['index', 'lista'])
Example #12
0
import datetime
import os
from cablemap.core import cables_from_source

from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh.writing import AsyncWriter
from whoosh.qparser import QueryParser
from whoosh import fields, sorting
from whoosh.support.charset import accent_map

# Hard-coded path to the WikiLeaks cables CSV dump on the author's machine.
fname = "/home/luiztheodoro/Documentos/mestrado/iri/trab_final/wikileaks-cables/cables.csv"

# Smoke test: print subject and creation date of the first cables.
# The break check runs after the print, so 7 cables (i = 0..6) are shown.
i = 0
for cable in cables_from_source(fname):
    print(cable.subject, cable.created)
    if i > 5:
        break
    i += 1

# Collect the cable's public attribute names (relies on `cable` leaking
# out of the loop above) to decide which fields to index.
atributos = [i for i in dir(cable) if not i.startswith('_')]
# NOTE(review): after the first pop the indices shift, so the second pop
# removes the element at position 5 of the ORIGINAL list; the inline
# labels assume a specific dir() ordering — verify against the Cable class.
atributos.pop(3)  #remove "classification_categories"
atributos.pop(4)  #remove "comment"
print(atributos)


def build_doc(cable):
    doc = {}
    for a in atributos:
        try: