Code example #1
File: etl.py  Project: zangree/etlpy
import csv
import json
import xml.etree.ElementTree as ET


def FileOper(etl, data, mode):
    """Read or write etl.FilePath, dispatching on the file extension.

    Both branches are generators: 'r' yields rows, 'w' writes the rows in
    data and yields them back, so the caller must consume the result.
    """
    path = etl.FilePath
    filetype = path.split('.')[-1].lower()
    encode = 'utf-8' if etl.EncodingType == 'UTF8' else 'ascii'
    if filetype in ['csv', 'txt']:
        sp = ',' if filetype == 'csv' else '\t'
        # newline='' lets the csv module control line endings itself
        file = open(path, mode, encoding=encode, newline='')
        if mode == 'r':
            reader = csv.DictReader(file, delimiter=sp)
            for r in reader:
                yield r
        else:
            writer = None
            for r in data:
                if writer is None:
                    # take the field names from the first row
                    writer = csv.DictWriter(file, fieldnames=list(r.keys()),
                                            delimiter=sp)
                    writer.writeheader()
                writer.writerow(r)
                yield r
        file.close()
    elif filetype == 'xlsx':
        pass
    elif filetype == 'xml' and mode == 'r':
        tree = ET.parse(path)
        root = tree.getroot()
        for etool in root.findall('Doc'):
            # copy each Doc element's attributes into a plain dict
            yield dict(etool.attrib)
    elif filetype == 'xml' and mode == 'w':
        pass
    elif filetype == 'json':
        if mode == 'r':
            with open(path, encoding=encode) as f:
                items = json.load(f)
            for r in items:
                yield r
        else:
            # buffer the rows, then dump them all in one call
            items = []
            for r in data:
                items.append(r)
                yield r
            with open(path, mode, encoding=encode) as f:
                json.dump(items, f)
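Because both branches are generators, a write does nothing until the generator is consumed. A minimal usage sketch (the Etl class below is a hypothetical stand-in for the real etl tool object, which only needs FilePath and EncodingType attributes here):

class Etl:
    FilePath = 'out.csv'
    EncodingType = 'UTF8'

rows = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
# exhaust the generator to actually perform the write
for _ in FileOper(Etl(), rows, 'w'):
    pass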
Code example #2
    def fetch(self,
              word,
              language=None,
              old_id=None,
              cache_dir='/data/rsg/nlp/j_luo/wiki/wiktionary'):
        language = self.language if not language else language
        # cache_dir arrives as a plain string, but the '/' joins below need a
        # pathlib.Path (Path is assumed to be imported at module level)
        cache_dir = Path(cache_dir)
        path = cache_dir / 'htmls' / get_path(word) / f'{word}.html'
        json_path = cache_dir / 'jsons' / get_path(word) / f'{word}.json'
        try:  # Use cached json if it exists.
            with json_path.open(mode='r', encoding='utf8') as fin:
                return json.load(fin)
        except FileNotFoundError:
            pass

        try:  # Use cached html if possible.
            with path.open(mode='r', encoding='utf8') as fin:
                response_text = fin.read()
        except FileNotFoundError:
            response = self.session.get(self.url.format(word),
                                        params={'oldid': old_id})
            response_text = response.text
            path.parent.mkdir(parents=True, exist_ok=True)
            with path.open(mode='w', encoding='utf8') as fout:
                fout.write(response_text)

        self.soup = BeautifulSoup(response_text.replace('>\n<', '><'),
                                  'html.parser')
        self.current_word = word
        self.clean_html()
        try:
            ret = self.get_word_data(language.lower())
            with json_path.open(mode='w', encoding='utf8') as fout:
                json.dump(ret, fout)
            return ret
        except Exception:
            # report the offending word, then re-raise with the original traceback
            print(word)
            raise
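The method stacks two read-through caches (parsed json, then raw html) in front of the network. Reduced to its skeleton, with hypothetical fetch and parse callables standing in for the session request and get_word_data (a sketch, not the class's actual API):

import json

def read_through(json_path, html_path, fetch, parse):
    try:
        with open(json_path, encoding='utf8') as fin:
            return json.load(fin)      # fastest: parsed result on disk
    except FileNotFoundError:
        pass
    try:
        with open(html_path, encoding='utf8') as fin:
            html = fin.read()          # next: raw page on disk
    except FileNotFoundError:
        html = fetch()                 # slowest: hit the network
        with open(html_path, 'w', encoding='utf8') as fout:
            fout.write(html)
    result = parse(html)
    with open(json_path, 'w', encoding='utf8') as fout:
        json.dump(result, fout)        # refill the fast cache for next time
    return result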
Code example #3
#!/usr/bin/env python

import time as t
import json

lp = True
if lp:
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()

if __name__ == '__main__':
    f = '/home/brian/Desktop/Code/Messenger/facebook-briandigiorgio/messages/anniezanger_2ca6e61030/message.html'
    with open(f) as fi:
        data = json.load(fi)
    messages = data['messages']
    messages = messages.replace('\n', '')
    messages = messages.replace('&#039;', "'")
    messages = messages.replace('&quot;', '"')

    userstr = '<span class="user">'
    datestr = '</span><span class="meta">'
    textstr = '</span></div></div><p>'
    endstr = '</p>'
    videostr = '<span style="float:right">Duration: '
    photostr = '<img src="'

    header = 'mtype,user,week,month,day,year,hour,minute,ampm,time,text'
    if lp:
        header += ',comp,neg,neu,pos'

    out = open('messages.csv', 'w+')
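The extra comp,neg,neu,pos columns line up with what VADER's polarity_scores returns: a dict with 'neg', 'neu', 'pos' and 'compound' keys. For each parsed message, the row would presumably be extended like this (text and row are hypothetical names for variables the truncated loop would define):

scores = sid.polarity_scores(text)
row += ',%s,%s,%s,%s' % (scores['compound'], scores['neg'],
                         scores['neu'], scores['pos'])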
Code example #4
import requests
from bottle import Bottle, response, request as bottle_request
import json
# assumption: a local api module exposing the path to the bot's JSON config;
# the original import ("from api") was cut off
from api import api

with open(api) as f:
    test = json.load(f)

class BotHandlerMixin:  
    BOT_URL = None

    def get_chat_id(self, data):
        """
        Method to extract chat id from telegram request.
        """
        chat_id = data['message']['chat']['id']

        return chat_id

    def get_message(self, data):
        """
        Method to extract message id from telegram request.
        """
        message_text = data['message']['text']

        return message_text

    def send_message(self, prepared_data):
        """
        Prepared data should be json which includes at least `chat_id` and `text`
        """       
        message_url = self.BOT_URL + 'sendMessage'
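The method is truncated here; in the usual bottle webhook pattern it would finish by POSTing the payload to the constructed Telegram API URL, roughly:

        requests.post(message_url, json=prepared_data)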
Code example #5
import json

file = open('json.json')
data = json.load(file)
for item in data:
    print(item)
Code example #6
import json


def load_json(path):
    """Returns dictionary from json path"""
    with open(path, "r") as w:
        adict = json.load(w)
    return adict
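A matching writer, for symmetry (a sketch, not part of the original module):

def save_json(path, adict):
    """Writes dictionary to json path"""
    with open(path, "w") as w:
        json.dump(adict, w)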
Code example #7
    def write_json(self):
        # the context manager closes the file even if json.dump raises
        with open(self.parsed.json_file, 'w+') as json_f:
            json.dump(self.json_fields, json_f)
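As a design note, 'w+' also opens the file for reading back; json.dump only writes, so plain 'w' would be sufficient here.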
Code example #8
import json

from gensim.utils import simple_preprocess
from operator import itemgetter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.manifold import TSNE
from sklearn.manifold.t_sne import (_joint_probabilities, _kl_divergence)
from sklearn.utils.extmath import _ravel
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

with open('../raw_data/train.json') as json_file:
    train_file = json.load(json_file)

with open('../raw_data/test.json') as json_file:
    val_file = json.load(json_file)

save_path = "../write_data/"

# Using some stopwords from https://github.com/AlludedCrabb/sound-tasty
cooking_stop_words = list(
    set([
        'canned',
        'cans',
        'drained',
        'and',
        'halved',
        'cup',
        'cups',
        'teaspoon',
        'tablespoon',
Code example #9
File: main.py  Project: tynski/ReinforcementLearning
import gym
import numpy as np
import matplotlib.pyplot as plt
import os
import json

env = gym.make("MountainCar-v0")

data = {}

with open('hiperparemters.josn', 'r') as file:
    data = json.load(file)

# Hyperparameters
LEARNING_RATE = data['learning_rate']
# how much weight future action values carry, between (0, 1)
DISCOUNT = data['discount']
EPISODES = data['episodes']

# Exploration settings
epsilon = data['epsilon']
START_EPSILON_DECAYING = data['start_epsilon_decaying']

END_EPSILON_DECAYING = data['end_epsilon_decaying']
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

SHOW_EVERY = 500
save_qtable = True

# Discretization
# make continuous values more discrete by splitting them into bins
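epsilon_decay_value is the per-episode decrement that walks epsilon down across the decaying window. In this family of MountainCar Q-learning scripts it is typically applied at the end of each episode, roughly (a sketch; the training loop itself is not shown in this snippet):

for episode in range(EPISODES):
    # ... run the episode, updating the Q-table ...
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value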
Code example #10
def get_image_prov(j, gcis_url, dump_dir):
    """Generate PROV-ES JSON from GCIS image metadata."""

    # create doc
    doc = ProvEsDocument()
    bndl = None

    # create image, figure, chapter and report entities
    img_id = GCIS["%s" % j['uri'][1:].replace('/', '-')]
    img_title = j['title']
    img_url = None
    img_thumbnail_url = None
    
    # get the image and thumbnail URLs from the file metadata
    for file_md in j.get('files', []):
        img_url = file_md['href']
        img_thumbnail_url = file_md['thumbnail_href']
    img_attrs = [
        ( PROV_TYPE, GCIS['Image'] ),
        ( PROV_LABEL, img_title ),
    ]
    if img_url is None:
        img_attrs.append(( PROV_LOCATION, "%s%s" % (gcis_url, j['uri']) ))
    else:
        img_attrs.append(( PROV_LOCATION, img_url ))
    if img_thumbnail_url is not None:
        img_attrs.append(( HYSDS['thumbnail'], img_thumbnail_url ))
    doc.entity(img_id, img_attrs)
    reports = []
    chapters = []
    findings = []
    figures = []
    
    # get the figures that reference this image
    for figure in j.get('figures', []):
        report_uri = "%s/report/report_%s.json" %(dump_dir, figure['report_identifier'])
        chapter_uri = "%s/chapter/%s/report_%s_chapter_%s.json" % (dump_dir, figure['report_identifier'], figure['report_identifier'], figure['chapter_identifier'])
        figure_uri = "%s/figure/figure_%s.json" % (dump_dir, figure['identifier'])

        

        # create report from the local dump (the commented request shows the
        # equivalent live fetch)
        #r = requests.get('%s%s.json' % (gcis_url, report_uri))
        #r.raise_for_status()
        #report = r.json()
        with open(report_uri) as report_json:
            report = json.load(report_json)
        
        report_id = GCIS["%s" % report_uri[1:].replace('/', '-')]
        if report_id not in reports:
            doc.entity(report_id, [
                ( PROV_TYPE, GCIS['Report'] ),
                ( PROV_LABEL, report['title'] ),
                ( PROV_LOCATION, report['url'] ),
            ])
            reports.append(report_id)

        # create chapter
        r = requests.get('%s%s%s.json' % (gcis_url, report_uri, chapter_uri))
        if r.status_code != 200:
            print("Failed with %d code: %s" % (r.status_code, r.content))
            continue
        r.raise_for_status()
        chapter = r.json()
        chapter_id = GCIS["%s" % chapter_uri[1:].replace('/', '-')]
        if chapter_id not in chapters:
            doc.entity(chapter_id, [
                ( PROV_TYPE, GCIS['Chapter'] ),
                ( PROV_LABEL, chapter['title'] ),
                ( PROV_LOCATION, chapter['url'] ),
            ])
            chapters.append(chapter_id)
        doc.hadMember(report_id, chapter_id)
         
        # create findings
        r = requests.get('%s%s%s/finding.json' % (gcis_url, report_uri, chapter_uri))
        r.raise_for_status()
        for f in r.json():
            finding_id = GCIS["%s" % f['identifier']]
            if finding_id not in findings:
                doc.entity(finding_id, [
                    ( PROV_TYPE, GCIS['Finding'] ),
                    ( PROV_LABEL, f['identifier'] ),
                    ( PROV_LOCATION, f['href'] ),
                ])
                findings.append(finding_id)
            doc.hadMember(report_id, finding_id)
            doc.hadMember(chapter_id, finding_id)
         
        # create figure
        r = requests.get('%s%s%s%s.json' % (gcis_url, report_uri, chapter_uri, figure_uri))
        r.raise_for_status()
        figure_md = r.json()
        figure_id = GCIS["%s" % figure_uri[1:].replace('/', '-')]
        if figure_id not in figures:
            doc.entity(figure_id, [
                ( PROV_TYPE, GCIS['Figure'] ),
                ( PROV_LABEL, figure_md['title'] ),
                ( PROV_LOCATION, "%s%s" % (gcis_url, figure_md['uri']) ),
            ])
            figures.append(figure_id)
            doc.hadMember(chapter_id, figure_id)
        doc.hadMember(figure_id, img_id)

    # create agents or organizations
    agent_ids = {}
    org_ids = {}
    for cont in j.get('contributors', []):
        # replace slashes because we get prov.model.ProvExceptionInvalidQualifiedName errors
        agent_id = GCIS["%s" % cont['uri'][1:].replace('/', '-')]

        # create person
        if len(cont['person']) > 0:
            # agent 
            agent_name  = " ".join([cont['person'][i] for i in
                                   ('first_name', 'middle_name', 'last_name')
                                   if cont['person'].get(i, None) is not None])
            doc.agent(agent_id, [
                ( PROV_TYPE, GCIS["Person"] ),
                ( PROV_LABEL, agent_name ),
                ( PROV_LOCATION, "%s%s" % (gcis_url, cont['uri']) ),
            ])
            agent_ids[agent_id] = []

        # organization
        if len(cont['organization']) > 0:
            org = cont['organization']
            org_id = GCIS["%s" % cont['organization']['identifier']]
            if org_id not in org_ids:          
                doc.governingOrganization(org_id, cont['organization']['name'])
                org_ids[org_id] = True
            if agent_id in agent_ids:
                agent_ids[agent_id].append(org_id)

    # create activity
    start_time = j['create_dt']
    end_time = j['create_dt']
    for parent in j.get('parents', []):
        input_id = GCIS["%s" % parent['url'][1:].replace('/', '-')]
        input_name = parent['label']
        doc.entity(input_id, [
            ( PROV_TYPE, GCIS["Dataset"] ),
            ( PROV_LABEL, input_name ),
            ( PROV_LOCATION, "%s%s" % (gcis_url, parent['url']) ),
        ])
        # some activity URIs are null
        if parent['activity_uri'] is None:
            act_id = GCIS["derive-from-%s" % input_id]
        else:
            act_id = GCIS["%s" % parent['activity_uri'][1:].replace('/', '-')]
        attrs = []
        for agent_id in agent_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, agent_id))]
            doc.wasAssociatedWith(act_id, agent_id, None, waw_id, {'prov:role': GCIS['Contributor']})
            for org_id in agent_ids[agent_id]:
                del_id = GCIS["%s" % get_uuid("%s:%s:%s" % (agent_id, org_id, act_id))]
                doc.delegation(agent_id, org_id, act_id, del_id, {'prov:type': GCIS['worksAt']})
        for org_id in org_ids:
            waw_id = GCIS["%s" % get_uuid("%s:%s" % (act_id, org_id))]
            doc.wasAssociatedWith(act_id, org_id, None, waw_id, {'prov:role': GCIS['Funder']})
        act = doc.activity(act_id, start_time, end_time, attrs)
        doc.used(act, input_id, start_time, GCIS["%s" % get_uuid("%s:%s" % (act_id, input_id))])
        doc.wasGeneratedBy(img_id, act, end_time, GCIS["%s" % get_uuid("%s:%s" % (img_id, act_id))])
           
    # serialize
    prov_json = json.loads(doc.serialize())

    # for hadMember relations, add prov:type
    for hm_id in prov_json.get('hadMember', {}):
        hm = prov_json['hadMember'][hm_id]
        col = hm['prov:collection'] 
        ent = hm['prov:entity'] 
        if col in reports and ent in chapters:
            hm['prov:type'] = GCIS['hasChapter']
        elif col in chapters and ent in figures:
            hm['prov:type'] = GCIS['hasFigure']
        elif col in figures and ent == img_id:
            hm['prov:type'] = GCIS['hasImage']

    return prov_json