def normalize_records(inputfiles, outputdir):
    print("Normalizing", len(inputfiles), "records to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        normalized_record = normalize(record)
        # Anchor the replacement to the extension so a name like
        # "xml_dump.xml" maps to "xml_dump.json", not "json_dump.json".
        out_file = os.path.basename(filename).replace(".xml", ".json")
        write_json_file(outputdir, out_file, normalized_record)
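# normalize_records leans on helpers defined elsewhere in the project. The
# sketch below shows plausible minimal versions, assuming
# write_json_file(outputdir, filename, data) serializes `data` to
# <outputdir>/<filename> and progress(fraction) renders a simple indicator;
# these are illustrative assumptions, not the project's actual implementations.
import json
import os
import sys


def write_json_file(outputdir, filename, data):
    # Create the target directory if needed and dump `data` as UTF-8 JSON.
    os.makedirs(outputdir, exist_ok=True)
    with open(os.path.join(outputdir, filename), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def progress(fraction):
    # Overwrite a single status line on stderr with the completed percentage.
    sys.stderr.write("\r{:.0%}".format(fraction))
    sys.stderr.flush()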
def store_wikidata_author_id(line, author_id, gnd, response):
    row = line.split(";")
    common.write_json_file(
        WIKIDATA_AUTHOR_DIR,
        str(row[ONB_COL]) + common.UNDERSCORE + gnd + common.UNDERSCORE
        + str(author_id) + common.JSON_EXT,
        response)
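# Hypothetical usage, assuming ONB_COL indexes the ONB identifier column of a
# semicolon-separated mapping line and common defines UNDERSCORE ("_") and
# JSON_EXT (".json"). With ONB_COL == 0, the call below would write the
# response to WIKIDATA_AUTHOR_DIR/onb123_118540238_Q5879.json:
#
#     line = "onb123;Johann Wolfgang von Goethe;118540238"
#     store_wikidata_author_id(line, "Q5879", "118540238", response)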
def store_compositions_data(composition_id, response):
    filename = str(composition_id).replace(FREEBASE_ID_PREFIX, '') + common.JSON_EXT
    response_json = common.is_stored_as_json_file(
        FREEBASE_COMPOSITIONS_DATA_DIR + common.SLASH + filename)
    if response_json is None:
        # Not cached yet: report and store the response.
        print('composition data not yet stored for ID:', composition_id)
        common.write_json_file(FREEBASE_COMPOSITIONS_DATA_DIR, filename, response)
def store_compositions(author_id, response):
    filename = str(author_id).replace(FREEBASE_ID_PREFIX, '') + common.JSON_EXT
    response_json = common.is_stored_as_json_file(
        FREEBASE_COMPOSITIONS_DIR + common.SLASH + filename)
    if response_json is None:
        # Not cached yet: report and store the response.
        print('compositions not yet stored for author:', author_id)
        common.write_json_file(FREEBASE_COMPOSITIONS_DIR, filename, response)
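# Both store_* functions use common.is_stored_as_json_file as a write-once
# cache check. A minimal sketch of such a helper, assuming it returns the
# parsed JSON when the file already exists and None otherwise (the real
# common module may behave differently):
import json
import os


def is_stored_as_json_file(path):
    # Return the parsed JSON content of `path` if it exists, else None.
    if not os.path.isfile(path):
        return None
    with open(path, encoding="utf-8") as f:
        return json.load(f)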
def normalize_records(inputfiles, outputdir):
    # Variant of normalize_records above that also echoes each result.
    print("Normalizing", len(inputfiles), "records to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        normalized_record = normalize(record)
        print('normalized record:', normalized_record)
        out_file = os.path.basename(filename).replace(".xml", ".json")
        write_json_file(outputdir, out_file, normalized_record)
def enrich_records(inputfiles, outputdir, force=False):
    print("Enriching", len(inputfiles), "records. Saving to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        out_file = os.path.basename(filename).replace(".json", "_enriched.json")
        out_path = os.path.join(outputdir, out_file)
        if os.path.exists(out_path) and not force:
            print(out_file, "already enriched. Skipping...")
        else:
            enriched_record = enrich(record)
            write_json_file(outputdir, out_file, enriched_record)
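# Hypothetical usage, assuming the normalized JSON records produced above live
# in "normalized/". The force flag re-runs enrichment even when an
# *_enriched.json output already exists:
#
#     files = glob.glob("normalized/*.json")
#     enrich_records(files, "enriched")              # skips existing outputs
#     enrich_records(files, "enriched", force=True)  # recomputes everything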
def main():
    # Parse args
    args = docopt(__doc__.format(self_filename=Path(__file__).name))

    # Parse config file
    with open(args['<config_file>'] or DEFAULT_CONFIG_FILE) as f:
        config_yaml = yaml.load(f.read(), Loader=yaml.BaseLoader)
    config = config_model.Config(**config_yaml)

    # Init logging
    log = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(levelname)s: %(message)s",
        level=(logging.DEBUG if args['--debug'] else logging.INFO))

    # Parse already_seen file
    if not SEEN_FILEPATH.exists():
        log.error(
            f'init "{SEEN_FILEPATH}" file before use. ("echo \'[]\' > {SEEN_FILEPATH}")')
        sys.exit(-1)
    already_seen_set = set()
    if SEEN_FILEPATH.stat().st_size != 0:
        with open(SEEN_FILEPATH) as f:
            already_seen_set = set(ensure_list(json.load(f)))

    new_ids_set = links = None
    try:
        _, new_ids_set, links = scrapper.scrap(
            config, log,
            already_seen_set=already_seen_set,
            send_sms=not args["--no-sms"])
    except:
        if not args["--test"] and not args["--no-sms"]:
            scrapper.send_sms("EXCEPTION", config)
        raise
    finally:
        # Write newly found ids to the seen file
        if new_ids_set:
            print(f'-> update {SEEN_FILEPATH!r}')
            write_json_file(SEEN_FILEPATH, list(already_seen_set | new_ids_set))

    if args['--clipboard'] and links:
        # Copy URL links to clipboard
        import clipboard
        try:
            clipboard.copy("\n".join(links))
        except Exception as e:
            log.error(f"Error while copying to clipboard:\n{e}")
            traceback.print_tb(e.__traceback__)
        else:
            log.info("URLs copied to clipboard")
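# This main() assumes a write_json_file(path, data) helper (note: a different
# signature from the (outputdir, filename, data) variants above) and an
# ensure_list(value) coercion. Minimal sketches under those assumptions:
import json


def ensure_list(value):
    # Wrap a scalar in a list; pass lists through unchanged.
    return value if isinstance(value, list) else [value]


def write_json_file(path, data):
    # Serialize `data` as JSON to `path` (a pathlib.Path or str).
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)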
def create_word2vec_model():
    mecab = MeCab.Tagger('-Owakati')
    nouns = []
    articles = {}

    # Text from NHK NEWS WEB
    urls = [
        'http://www3.nhk.or.jp/rss/news/cat' + str(i) + '.xml' for i in range(8)
    ]
    nhk = ''
    for url in urls:
        html = urllib.request.urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find_all('item')
        for i in items:
            item_soup = BeautifulSoup(str(i), 'html.parser')
            text = remove_tag(item_soup.title, 'title') + ' ' + remove_tag(
                item_soup.description, 'description')
            nhk += text
            articles[remove_tag(item_soup.title, 'title')] = {
                'url': remove_tag(item_soup.link, 'link'),
                'nouns': list(set(extract_nouns(text)))
            }
    nhk_nouns = extract_nouns(nhk)
    write_json_file(list(set(nhk_nouns)), 'nhk-nouns')
    write_json_file(articles, 'nhk-articles')
    nouns.extend(nhk_nouns)

    # Text from the Twitter user timeline
    tweets = api.user_timeline(count=200)
    twitter = ''
    for tweet in tweets:
        tweet = process_tweet(tweet)
        if tweet:
            twitter += tweet + '\n'
    twitter_nouns = extract_nouns(twitter)
    nouns.extend(twitter_nouns)
    write_json_file(list(set(twitter_nouns)), 'twitter-nouns')
    write_json_file(list(set(nouns)), 'nouns')

    # Train a word2vec model on the combined wakati-segmented text
    write_file(mecab.parse(nhk + twitter), 'wakati.txt')
    data = word2vec.Text8Corpus('wakati.txt')
    model = word2vec.Word2Vec(data, size=200, min_count=0)
    return model
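# Hypothetical usage of the returned model, using gensim's pre-4.0 API to
# match the `size=` keyword above; the query word is just an illustration:
#
#     model = create_word2vec_model()
#     for word, score in model.wv.most_similar('ニュース', topn=5):
#         print(word, score)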
def store_wikidata_composition_data(composition_id, response):
    common.write_json_file(WIKIDATA_COMPOSITION_DATA_DIR,
                           str(composition_id) + common.JSON_EXT, response)


def store_musicbrainz_composition_data(composition_id, response, dir):
    common.write_json_file(dir, str(composition_id) + common.JSON_EXT, response)


def store_wikidata_band_data(band_id, response):
    common.write_json_file(WIKIDATA_BAND_DATA_DIR,
                           str(band_id) + common.JSON_EXT, response)


def store_wikidata_author_data(author_id, response):
    common.write_json_file(WIKIDATA_AUTHOR_DATA_DIR,
                           str(author_id) + common.JSON_EXT, response)


def store_wikidata_property(property_id, response):
    common.write_json_file(WIKIDATA_PROPERTY_DIR,
                           str(property_id) + common.JSON_EXT, response)
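# These thin wrappers all delegate to common.write_json_file with an
# entity-specific target directory and an <id>.json filename. Hypothetical
# call, assuming `response` is the parsed JSON payload of a Wikidata API
# request:
#
#     store_wikidata_author_data('Q254', response)
#     # -> writes WIKIDATA_AUTHOR_DATA_DIR/Q254.json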