def generate_csv(in_dir, out):
    """\
    Walks through `in_dir` and generates the CSV file `out`.
    """
    # UnicodeWriter, cables_from_source and titlefy are provided by the
    # surrounding project (cablemap.core and its utilities).
    writer = UnicodeWriter(open(out, "wb"), delimiter=";")
    writer.writerow(("Reference ID", "Created", "Origin", "Subject"))
    for cable in cables_from_source(in_dir):
        writer.writerow((cable.reference_id, cable.created, cable.origin,
                         titlefy(cable.subject)))

def generate_csv(src, out):
    """\
    Walks through `src` and generates the CSV file `out`
    """
    writer = UnicodeWriter(open(out, 'wb'), delimiter=';')
    writer.writerow(('Reference ID', 'Created', 'Origin', 'Subject'))
    for cable in cables_from_source(src,
                                    predicate=pred.origin_filter(pred.origin_germany)):
        writer.writerow((cable.reference_id, cable.created, cable.origin,
                         titlefy(cable.subject)))
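
# A minimal usage sketch for the filtered export above; the file names are
# placeholders and `pred` is assumed to be the project's predicates module.
generate_csv('cables.csv', 'cables-germany.csv')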

import codecs


def generate_text_files(in_dir, out_dir, include_header=False):
    """\
    Walks through `in_dir` and generates text versions of the cables
    in `out_dir`.
    """
    for cable in cables_from_source(in_dir):
        out = codecs.open(out_dir + '/' + cable.reference_id + '.txt', 'wb',
                          encoding='utf-8')
        out.write(cable_to_text(cable, include_header))
        out.close()
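
# Hedged usage sketch with placeholder directory names; codecs.open() does not
# create missing directories, so the target directory is created up front.
import os

if not os.path.isdir('cables-txt'):
    os.makedirs('cables-txt')
generate_text_files('cables', 'cables-txt', include_header=True)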

from collections import defaultdict


def run_update(src, predicate=None):
    # Collects acronyms, cables without a subject, and TAG occurrences in one pass.
    acronyms = set(_ACRONYMS)
    subjects = set()
    tags = defaultdict(list)
    for cable in cables_from_source(src, predicate):
        update_acronyms(cable, acronyms)
        update_missing_subjects(cable, subjects)
        update_tags(cable, tags)
    return {'acronyms': acronyms, 'subjects': subjects, 'tags': tags}
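
# Sketch of consuming the collected data; the source path is a placeholder and
# _ACRONYMS plus the update_* helpers are assumed to be defined in this module.
result = run_update('cables.csv')
print('{0} acronyms, {1} cables without a subject, {2} distinct TAGs'.format(
    len(result['acronyms']), len(result['subjects']), len(result['tags'])))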

def generate_csv(path, out):
    """\
    Walks through `path` and generates the CSV file `out`.
    """
    def is_berlin_cable(filename):
        return 'BERLIN' in filename

    writer = UnicodeWriter(open(out, 'wb'), delimiter=';')
    writer.writerow(('Reference ID', 'Created', 'Origin', 'Subject'))
    for cable in cables_from_source(path, predicate=is_berlin_cable):
        writer.writerow((cable.reference_id, cable.created, cable.origin,
                         titlefy(cable.subject)))

import codecs
import os


def generate_csv(path, out):
    """\
    Walks through `path` and generates the CSV file `out`.
    """
    with codecs.open(os.path.join(os.path.dirname(__file__), 'person_names.txt'),
                     'rb', 'utf-8') as f:
        known_persons = set(l.rstrip() for l in f)
    writer = UnicodeWriter(open(out, 'wb'), delimiter=';')
    for cable in cables_from_source(path):
        content = cable.content_body
        if not content:
            continue
        persons = [person for person in known_persons if person in content]
        if persons:
            row = [cable.reference_id]
            row.extend(persons)
            writer.writerow(row)

def cable_doc_gen():
    """
    Generator that iterates over cables.csv, yielding one cable at a time
    wrapped in a bulk-action dictionary compatible with Elasticsearch.
    """
    for j, cable in enumerate(cables_from_source(fname)):
        doc = build_doc(cable)
        action = {
            "_index": "wikileaks",
            "_type": "telegramas",
            "_id": j,
            "doc": doc
        }
        # Only the first 1,000 cables are yielded.
        if j == 1000:
            break
        if j % 1000 == 0:
            print("Indexing cable number {}".format(j))
        yield action
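
# Hedged sketch of feeding the generator above to Elasticsearch via the standard
# elasticsearch-py bulk helper; the connection URL is a placeholder.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch("http://localhost:9200")
bulk(es, cable_doc_gen())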

import pandas as pd
from gensim.parsing.preprocessing import (remove_stopwords, strip_non_alphanum,
                                          strip_punctuation, strip_short)


def telegrams():
    rows = []
    for j, cable in enumerate(cables_from_source(fname)):
        print("Generating cable {}".format(j), end='\r')
        content = getattr(cable, 'content')
        # Keep only the numbered body of the cable and normalise it.
        content = content[content.find("1. "):len(content) - 1].lower()
        content = strip_short(content, minsize=3)
        content = strip_punctuation(content)
        content = strip_non_alphanum(content)
        content = remove_stopwords(content)
        # `lemmatization` is a project-specific helper (not shown here) that keeps
        # only the requested part-of-speech tags.
        content = lemmatization(content, ['NOUN'])
        rows.append({'index': j, 'lista': content})
    return pd.DataFrame(rows, columns=['index', 'lista'])
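
# Hedged follow-up, assuming `lemmatization` returns a list of tokens: the
# preprocessed column can feed a gensim dictionary and bag-of-words corpus.
from gensim.corpora import Dictionary

df = telegrams()
docs = df['lista'].tolist()
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]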

import datetime
import os

from cablemap.core import cables_from_source
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh.writing import AsyncWriter
from whoosh.qparser import QueryParser
from whoosh import fields, sorting
from whoosh.support.charset import accent_map

fname = "/home/luiztheodoro/Documentos/mestrado/iri/trab_final/wikileaks-cables/cables.csv"

# Print the subject and creation date of the first few cables as a sanity check.
i = 0
for cable in cables_from_source(fname):
    print(cable.subject, cable.created)
    if i > 5:
        break
    i += 1

# Collect the public attributes of a cable object for indexing.
atributos = [a for a in dir(cable) if not a.startswith('_')]
atributos.pop(3)  # remove "classification_categories"
atributos.pop(4)  # remove "comment"
print(atributos)


def build_doc(cable):
    doc = {}
    for a in atributos:
        try:
            # Assumption: each attribute value is copied onto the document;
            # attributes that cannot be read are stored as None.
            doc[a] = getattr(cable, a)
        except AttributeError:
            doc[a] = None
    return doc
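
# The Whoosh machinery imported above is unused in this excerpt; below is a
# hedged sketch of how an index could be built with it. The schema fields and
# the "indexdir" directory are assumptions, not part of the original script.
accent_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

schema = fields.Schema(
    reference_id=fields.ID(stored=True, unique=True),
    created=fields.ID(stored=True),
    origin=fields.ID(stored=True),
    subject=fields.TEXT(stored=True, analyzer=accent_analyzer),
    content=fields.TEXT(analyzer=accent_analyzer),
)

if not os.path.isdir("indexdir"):
    os.mkdir("indexdir")
ix = create_in("indexdir", schema)

writer = AsyncWriter(ix)
for cable in cables_from_source(fname):
    writer.add_document(reference_id=cable.reference_id,
                        created=cable.created,
                        origin=cable.origin,
                        subject=cable.subject or u"",
                        content=cable.content or u"")
writer.commit()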