Example #1
def normalize_records(inputfiles, outputdir):
    print("Normalizing", len(inputfiles), "records to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        normalized_record = normalize(record)
        out_file = os.path.basename(filename).replace("xml", "json")
        write_json_file(outputdir, out_file, normalized_record)
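All of the examples on this page call a project-specific write_json_file helper that is not shown. Its exact definition varies between projects, but judging from the (directory, file name, data) call sites, a minimal sketch could look like this (the body is hypothetical, inferred from usage):

import json
import os

def write_json_file(outputdir, filename, data):
    # Hypothetical sketch: serialize data as JSON into outputdir/filename,
    # creating the output directory first if it does not exist yet.
    os.makedirs(outputdir, exist_ok=True)
    with open(os.path.join(outputdir, filename), 'w') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)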
Example #3
def store_wikidata_author_id(line, author_id, gnd, response):

    row = line.split(";")
    common.write_json_file(
        WIKIDATA_AUTHOR_DIR,
        str(row[ONB_COL]) + common.UNDERSCORE + gnd + common.UNDERSCORE +
        str(author_id) + common.JSON_EXT, response)
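Example #3 and the later Wikidata/Freebase examples also rely on string constants from a common module that is not shown. From the way they are concatenated, plausible values would be:

# Hypothetical values inferred from usage; the real common module may differ.
JSON_EXT = '.json'    # file extension appended to every output name
UNDERSCORE = '_'      # separator inside composite file names
SLASH = '/'           # path separator used for manual path joins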
Example #4
def store_compositions_data(composition_id, response):

    filename = str(composition_id).replace(FREEBASE_ID_PREFIX, '') + common.JSON_EXT
    response_json = common.is_stored_as_json_file(
        FREEBASE_COMPOSITIONS_DATA_DIR + common.SLASH + filename)
    if response_json is None:
        # inputfile = glob.glob(FREEBASE_COMPOSITIONS_DATA_DIR + SLASH + filename)
        # if not inputfile:
        print('composition does not exist for ID:', composition_id)
        common.write_json_file(FREEBASE_COMPOSITIONS_DATA_DIR, filename, response)
Example #5
def store_compositions(author_id, response):

    filename = str(author_id).replace(FREEBASE_ID_PREFIX, '') + common.JSON_EXT
    response_json = common.is_stored_as_json_file(FREEBASE_COMPOSITIONS_DIR +
                                                  common.SLASH + filename)
    if response_json is None:
        # inputfile = glob.glob(FREEBASE_COMPOSITIONS_DIR + SLASH + filename)
        # if not inputfile:
        print('compositions do not exist for author:', author_id)
        common.write_json_file(FREEBASE_COMPOSITIONS_DIR, filename, response)
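Examples #4 and #5 gate the write on common.is_stored_as_json_file, which is not shown on this page. Judging by the is None check, it presumably returns the parsed JSON when the file already exists and None otherwise; a minimal sketch under that assumption:

import json
import os

def is_stored_as_json_file(path):
    # Return the parsed JSON content if the file exists, otherwise None,
    # so callers can treat the result as both a cache hit and the data.
    if not os.path.isfile(path):
        return None
    with open(path) as f:
        return json.load(f)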
Example #6
def normalize_records(inputfiles, outputdir):
    print("Normalizing", len(inputfiles), "records to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        normalized_record = normalize(record)
        print('normalized record:', normalized_record)
        #new_normalized_record = [unicode(elem).encode('utf-8') for elem in normalized_record]
        #print 'new normalized record: ', new_normalized_record

        out_file = os.path.basename(filename).replace("xml", "json")
        write_json_file(outputdir, out_file, normalized_record)
Example #7
def enrich_records(inputfiles, outputdir, force=False):
    print("Enriching", len(inputfiles), "records. Saving to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        out_file = os.path.basename(filename).replace(".json",
                                                      "_enriched.json")
        out_path = outputdir + "/" + out_file
        if os.path.exists(out_path) and not force:
            print(out_file, "already enriched. Skipping...")
        else:
            enriched_record = enrich(record)
            write_json_file(outputdir, out_file, enriched_record)
Example #9
def main():
    # parse args
    args = docopt(__doc__.format(self_filename=Path(__file__).name))

    # Parse config file
    with open(args['<config_file>'] or DEFAULT_CONFIG_FILE) as f:
        config_yaml = yaml.load(f.read(), Loader=yaml.BaseLoader)
        config = config_model.Config(**config_yaml)

    # Init logging
    log = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(levelname)s: %(message)s",
        level=(logging.DEBUG if args['--debug'] else logging.INFO))

    # Parse already_seen file
    if not SEEN_FILEPATH.exists():
        log.error(
            f'init "{SEEN_FILEPATH}" file before use ("echo \'[]\' > {SEEN_FILEPATH}")'
        )
        sys.exit(-1)
    already_seen_set = set()
    if SEEN_FILEPATH.stat().st_size != 0:
        with open(SEEN_FILEPATH) as f:
            already_seen_set = set(ensure_list(json.load(f)))

    new_ids_set = links = None
    try:
        _, new_ids_set, links = scrapper.scrap(
            config,
            log,
            already_seen_set=already_seen_set,
            send_sms=not args["--no-sms"])
    except:
        if not args["--test"] and not args["--no-sms"]:
            scrapper.send_sms("EXCEPTION", config)
        raise
    finally:
        # Write new found ids to seen file
        if new_ids_set:
            print(f'-> update {SEEN_FILEPATH!r}')
            write_json_file(SEEN_FILEPATH,
                            list(already_seen_set | new_ids_set))
        if args['--clipboard'] and links:
            # Copy urls links to clipboard
            import clipboard
            try:
                clipboard.copy("\n".join(links))
            except Exception as e:
                log.error(f"Error while copying to clipboard:\n{e}")
                traceback.print_tb(e.__traceback__)
            else:
                log.info("URLs copied to clipboard")
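Unlike the three-argument (directory, file name, data) form in the earlier examples, write_json_file is called here with just a file path and the data, so this project's helper presumably has a two-argument signature; a hypothetical sketch:

import json

def write_json_file(filepath, data):
    # Hypothetical two-argument variant: serialize data straight to filepath.
    with open(filepath, 'w') as f:
        json.dump(data, f)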
Example #10
def create_word2vec_model():
    mecab = MeCab.Tagger('-Owakati')
    nouns = []
    articles = {}

    # text from NHK NEWS WEB
    urls = [
        'http://www3.nhk.or.jp/rss/news/cat' + str(i) + '.xml'
        for i in range(8)
    ]
    nhk = ''
    for url in urls:
        html = urllib.request.urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')

        items = soup.find_all('item')
        for i in items:
            item_soup = BeautifulSoup(str(i), 'html.parser')
            text = remove_tag(item_soup.title, 'title') + ' ' + remove_tag(
                item_soup.description, 'description')
            nhk += text
            articles[remove_tag(item_soup.title, 'title')] = {
                'url': remove_tag(item_soup.link, 'link'),
                'nouns': list(set(extract_nouns(text)))
            }

    nhk_nouns = extract_nouns(nhk)
    write_json_file(list(set(nhk_nouns)), 'nhk-nouns')
    write_json_file(articles, 'nhk-articles')
    nouns.extend(nhk_nouns)

    # text from twitter user time-line
    tweets = api.user_timeline(count=200)
    twitter = ''
    for tweet in tweets:
        tweet = process_tweet(tweet)
        if tweet:
            twitter += tweet + '\n'
    twitter_nouns = extract_nouns(twitter)
    nouns.extend(twitter_nouns)
    write_json_file(list(set(twitter_nouns)), 'twitter-nouns')

    write_json_file(list(set(nouns)), 'nouns')
    write_file(mecab.parse(nhk + twitter), 'wakati.txt')

    data = word2vec.Text8Corpus('wakati.txt')
    model = word2vec.Word2Vec(data, size=200, min_count=0)

    return model
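The size keyword dates this example to gensim before 4.0, which renamed it to vector_size. Under that older API, the returned model could be queried along these lines ('ニュース' is just an arbitrary query word; any term from the training corpus works):

model = create_word2vec_model()
# Print the five words whose vectors are closest to the query term.
# The trained vectors live on model.wv (gensim >= 1.0).
for word, similarity in model.wv.most_similar('ニュース', topn=5):
    print(word, similarity)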
Example #11
def store_wikidata_composition_data(composition_id, response):

    common.write_json_file(WIKIDATA_COMPOSITION_DATA_DIR,
                           str(composition_id) + common.JSON_EXT, response)
Example #12
def store_musicbrainz_composition_data(composition_id, response, dir):

    common.write_json_file(dir, str(composition_id) + common.JSON_EXT, response)
Example #13
def store_wikidata_band_data(band_id, response):

    common.write_json_file(WIKIDATA_BAND_DATA_DIR,
                           str(band_id) + common.JSON_EXT, response)
Example #17
def store_wikidata_author_data(author_id, response):

    common.write_json_file(WIKIDATA_AUTHOR_DATA_DIR,
                           str(author_id) + common.JSON_EXT, response)
Example #18
def store_wikidata_property(property_id, response):

    common.write_json_file(WIKIDATA_PROPERTY_DIR,
                           str(property_id) + common.JSON_EXT, response)