import csv
import json
import xml.etree.ElementTree as ET


def FileOper(etl, data, type):
    # Dispatch on the file extension and stream records in ('r') or out ('w').
    path = etl.FilePath
    filetype = path.split('.')[-1].lower()
    encode = 'utf-8' if etl.EncodingType == 'UTF8' else 'ascii'
    if filetype in ['csv', 'txt']:
        sp = ',' if filetype == 'csv' else '\t'
        file = open(path, type, encoding=encode, newline='')
        if type == 'r':
            reader = csv.DictReader(file, delimiter=sp)
            for r in reader:
                yield r
        else:
            writer = csv.DictWriter(file, fieldnames=None, delimiter=sp)
            start = False
            for r in data:
                if not start:
                    # Take the field names from the first record and write the header row.
                    field = list(r.keys())
                    writer.fieldnames = field
                    writer.writerow(dict(zip(field, field)))
                    start = True
                writer.writerow(r)
                yield r
        file.close()
    elif filetype == 'xlsx':
        pass
    elif filetype == 'xml' and type == 'r':
        tree = ET.parse(path)
        root = tree.getroot()
        for etool in root.findall('Doc'):
            yield {k: etool.attrib[k] for k in etool.attrib}
    elif filetype == 'xml' and type == 'w':
        pass
    elif filetype == 'json':
        if type == 'r':
            with open(path, type, encoding=encode) as fin:
                items = json.load(fin)
            for r in items:
                yield r
        else:
            # Collect the records, yield them through, then dump the whole list once.
            rows = []
            for r in data:
                rows.append(r)
                yield r
            with open(path, type, encoding=encode) as fout:
                json.dump(rows, fout)
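# A minimal usage sketch of FileOper, assuming a stub ETL object that carries only the
# FilePath and EncodingType attributes read above; 'records.csv' is a hypothetical file.
from types import SimpleNamespace

etl = SimpleNamespace(FilePath='records.csv', EncodingType='UTF8')

# Reading: the 'r' branch ignores the data argument, so None is fine here.
for row in FileOper(etl, None, 'r'):
    print(row)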
def fetch(self, word, language=None, old_id=None,
          cache_dir='/data/rsg/nlp/j_luo/wiki/wiktionary'):
    language = self.language if not language else language
    # cache_dir is joined with '/', so treat it as a pathlib.Path
    # (assumes `from pathlib import Path` at module level).
    cache_dir = Path(cache_dir)
    path = cache_dir / 'htmls' / get_path(word) / f'{word}.html'
    json_path = cache_dir / 'jsons' / get_path(word) / f'{word}.json'
    try:
        # Use cached json if it exists.
        with json_path.open(mode='r', encoding='utf8') as fin:
            return json.load(fin)
    except FileNotFoundError:
        pass
    try:
        # Use cached html if possible.
        with path.open(mode='r', encoding='utf8') as fin:
            response_text = fin.read()
    except FileNotFoundError:
        response = self.session.get(self.url.format(word), params={'oldid': old_id})
        response_text = response.text
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open(mode='w', encoding='utf8') as fout:
            fout.write(response_text)
    self.soup = BeautifulSoup(response_text.replace('>\n<', '><'), 'html.parser')
    self.current_word = word
    self.clean_html()
    try:
        ret = self.get_word_data(language.lower())
        with json_path.open(mode='w', encoding='utf8') as fout:
            json.dump(ret, fout)
        return ret
    except Exception:
        print(word)
        raise
#!/usr/bin/env python
import time as t
import json

lp = True
if lp:
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()

if __name__ == '__main__':
    f = '/home/brian/Desktop/Code/Messenger/facebook-briandigiorgio/messages/anniezanger_2ca6e61030/message.html'
    # The export is an HTML dump, so read it as raw text and unescape the
    # entities before scanning for the markers below.
    with open(f, encoding='utf8') as fi:
        messages = fi.read()
    messages = messages.replace('\n', '')
    messages = messages.replace('&#039;', "'")
    messages = messages.replace('&quot;', '"')

    userstr = '<span class="user">'
    datestr = '</span><span class="meta">'
    textstr = '</span></div></div><p>'
    endstr = '</p>'
    videostr = '<span style="float:right">Duration: '
    photostr = '<img src="'

    header = 'mtype,user,week,month,day,year,hour,minute,ampm,time,text'
    if lp:
        header += ',comp,neg,neu,pos'

    out = open('messages.csv', 'w+')
import json

import requests
from bottle import Bottle, response, request as bottle_request
# The original import was truncated; `api` is assumed here to be the path of a
# JSON config exposed by a local `api` module.
from api import api

with open(api) as fin:
    test = json.load(fin)


class BotHandlerMixin:
    BOT_URL = None

    def get_chat_id(self, data):
        """Extract the chat id from a telegram request."""
        chat_id = data['message']['chat']['id']
        return chat_id

    def get_message(self, data):
        """Extract the message text from a telegram request."""
        message_text = data['message']['text']
        return message_text

    def send_message(self, prepared_data):
        """Prepared data should be json which includes at least `chat_id` and `text`."""
        message_url = self.BOT_URL + 'sendMessage'
import json

with open('json.json') as fin:
    data = json.load(fin)

for item in data:
    print(item)
import json


def load_json(path):
    """Return a dictionary loaded from the JSON file at `path`."""
    with open(path, "r") as fin:
        adict = json.load(fin)
    return adict
def write_json(self):
    with open(self.parsed.json_file, 'w+') as json_f:
        json.dump(self.json_fields, json_f)
import json

from gensim.utils import simple_preprocess
from operator import itemgetter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.manifold import TSNE
from sklearn.manifold.t_sne import (_joint_probabilities, _kl_divergence)
from sklearn.utils.extmath import _ravel
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

with open('../raw_data/train.json') as json_file:
    train_file = json.load(json_file)

with open('../raw_data/test.json') as json_file:
    val_file = json.load(json_file)

save_path = "../write_data/"

# Using some stopwords from https://github.com/AlludedCrabb/sound-tasty
cooking_stop_words = list(
    set([
        'canned', 'cans', 'drained', 'and', 'halved', 'cup', 'cups',
        'teaspoon', 'tablespoon',
import gym
import numpy as np
import matplotlib.pyplot as plt
import os
import json

env = gym.make("MountainCar-v0")

data = {}
with open('hiperparemters.josn', 'r') as file:
    data = json.load(file)

# Hyperparameters
LEARNING_RATE = data['learning_rate']  # how much we value future actions, between (0, 1)
DISCOUNT = data['discount']
EPISODES = data['episodes']

# Exploration settings
epsilon = data['epsilon']
START_EPSILON_DECAYING = data['start_epsilon_decaying']
END_EPSILON_DECAYING = data['end_epsilon_decaying']
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

SHOW_EVERY = 500
save_qtable = True

# Discretization
# make continuous values more discrete, split them into bins
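# If the config file does not exist yet, a sketch like this could create it with the
# keys the script reads; the numeric values below are placeholders, not values taken
# from the original project.
import json

default_hyperparameters = {
    'learning_rate': 0.1,
    'discount': 0.95,
    'episodes': 25000,
    'epsilon': 0.5,
    'start_epsilon_decaying': 1,
    'end_epsilon_decaying': 12500,
}

with open('hiperparemters.josn', 'w') as fout:
    json.dump(default_hyperparameters, fout, indent=2)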
def get_image_prov(j, gcis_url, dump_dir):
    """Generate PROV-ES JSON from GCIS image metadata."""

    # create doc
    doc = ProvEsDocument()
    bndl = None

    # create image, figure, chapter and report entities
    img_id = GCIS["%s" % j['uri'][1:].replace('/', '-')]
    img_title = j['title']
    img_url = None
    img_thumbnail_url = None

    # get files
    for file_md in j.get('files', []):
        img_url = file_md['href']
        img_thumbnail_url = file_md['thumbnail_href']

    img_attrs = [
        (PROV_TYPE, GCIS['Image']),
        (PROV_LABEL, img_title),
    ]
    if img_url is None:
        img_attrs.append((PROV_LOCATION, "%s%s" % (gcis_url, j['uri'])))
    else:
        img_attrs.append((PROV_LOCATION, img_url))
    if img_thumbnail_url is not None:
        img_attrs.append((HYSDS['thumbnail'], img_thumbnail_url))
    doc.entity(img_id, img_attrs)

    reports = []
    chapters = []
    findings = []
    figures = []

    # get figures
    for figure in j.get('figures', []):
        report_uri = "%s/report/report_%s.json" % (dump_dir, figure['report_identifier'])
        chapter_uri = "%s/chapter/%s/report_%s_chapter_%s.json" % (
            dump_dir, figure['report_identifier'],
            figure['report_identifier'], figure['chapter_identifier'])
        figure_uri = "%s/figure/figure_%s.json" % (dump_dir, figure['identifier'])

        # create report
        #r = requests.get('%s%s.json' % (gcis_url, report_uri))
        #r.raise_for_status()
        #report = r.json()
        with open(report_uri) as report_json:
            report = json.load(report_json)
        report_id = GCIS["%s" % report_uri[1:].replace('/', '-')]
        if report_id not in reports:
            doc.entity(report_id, [
                (PROV_TYPE, GCIS['Report']),
                (PROV_LABEL, report['title']),
                (PROV_LOCATION, report['url']),
            ])
            reports.append(report_id)

        # create chapter
        r = requests.get('%s%s%s.json' % (gcis_url, report_uri, chapter_uri))
        if r.status_code != 200:
            print("Failed with %d code: %s" % (r.status_code, r.content))
            continue
        r.raise_for_status()
        chapter = r.json()
        chapter_id = GCIS["%s" % chapter_uri[1:].replace('/', '-')]
        if chapter_id not in chapters:
            doc.entity(chapter_id, [
                (PROV_TYPE, GCIS['Chapter']),
                (PROV_LABEL, chapter['title']),
                (PROV_LOCATION, chapter['url']),
            ])
            chapters.append(chapter_id)
        doc.hadMember(report_id, chapter_id)

        # create findings
        r = requests.get('%s%s%s/finding.json' % (gcis_url, report_uri, chapter_uri))
        r.raise_for_status()
        for f in r.json():
            finding_id = GCIS["%s" % f['identifier']]
            if finding_id not in findings:
                doc.entity(finding_id, [
                    (PROV_TYPE, GCIS['Finding']),
                    (PROV_LABEL, f['identifier']),
                    (PROV_LOCATION, f['href']),
                ])
                findings.append(finding_id)
            doc.hadMember(report_id, finding_id)
            doc.hadMember(chapter_id, finding_id)

        # create figure
        r = requests.get('%s%s%s%s.json' % (gcis_url, report_uri, chapter_uri, figure_uri))
        r.raise_for_status()
        figure_md = r.json()
        figure_id = GCIS["%s" % figure_uri[1:].replace('/', '-')]
        if figure_id not in figures:
            doc.entity(figure_id, [
                (PROV_TYPE, GCIS['Figure']),
                (PROV_LABEL, figure_md['title']),
                (PROV_LOCATION, "%s%s" % (gcis_url, figure_md['uri'])),
            ])
            figures.append(figure_id)
        doc.hadMember(chapter_id, figure_id)
        doc.hadMember(figure_id, img_id)

    # create agents or organizations
    agent_ids = {}
    org_ids = {}
    for cont in j.get('contributors', []):
        # replace slashes because we get prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person
        if len(cont['person']) > 0:
            # agent
            agent_name = " ".join([cont['person'][i] for i in
                                   ('first_name', 'middle_name', 'last_name')
                                   if cont['person'].get(i, None) is not None])
            doc.agent(agent_id, [
                (PROV_TYPE, GCIS["Person"]),
                (PROV_LABEL, agent_name),
                (PROV_LOCATION, "%s%s" % (gcis_url, cont['uri'])),
            ])
            agent_ids[agent_id] = []

        # organization
        if len(cont['organization']) > 0:
            org = cont['organization']
            org_id = GCIS["%s" % cont['organization']['identifier']]
            if org_id not in org_ids:
                doc.governingOrganization(org_id, cont['organization']['name'])
                org_ids[org_id] = True
            if agent_id in agent_ids:
                agent_ids[agent_id].append(org_id)

    # create activity
    start_time = j['create_dt']
    end_time = j['create_dt']
    for parent in j.get('parents', []):
        input_id = GCIS["%s" % parent['url'][1:].replace('/', '-')]
        input_name = parent['label']
        doc.entity(input_id, [
            (PROV_TYPE, GCIS["Dataset"]),
            (PROV_LABEL, input_name),
            (PROV_LOCATION, "%s%s" % (gcis_url, parent['url'])),
        ])

        # some activity uri's are null
        if parent['activity_uri'] is None:
            act_id = GCIS["derive-from-%s" % input_id]
        else:
            act_id = GCIS["%s" % parent['activity_uri'][1:].replace('/', '-')]

        attrs = []
        for agent_id in agent_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
            doc.wasAssociatedWith(act_id, agent_id, None, waw_id,
                                  {'prov:role': GCIS['Contributor']})
            for org_id in agent_ids[agent_id]:
                del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
                doc.delegation(agent_id, org_id, act_id, del_id,
                               {'prov:type': GCIS['worksAt']})
        for org_id in org_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
            doc.wasAssociatedWith(act_id, org_id, None, waw_id,
                                  {'prov:role': GCIS['Funder']})
        act = doc.activity(act_id, start_time, end_time, attrs)
        doc.used(act, input_id, start_time,
                 GCIS["%s" % get_uuid("%s:%s" % (act_id, input_id))])
        doc.wasGeneratedBy(img_id, act, end_time,
                           GCIS["%s" % get_uuid("%s:%s" % (img_id, act_id))])

    # serialize
    prov_json = json.loads(doc.serialize())

    # for hadMember relations, add prov:type
    for hm_id in prov_json.get('hadMember', {}):
        hm = prov_json['hadMember'][hm_id]
        col = hm['prov:collection']
        ent = hm['prov:entity']
        if col in reports and ent in chapters:
            hm['prov:type'] = GCIS['hasChapter']
        elif col in chapters and ent in figures:
            hm['prov:type'] = GCIS['hasFigure']
        elif col in figures and ent == img_id:
            hm['prov:type'] = GCIS['hasImage']

    return prov_json