#!/usr/bin/env python
import unicodecsv
import sys
from decimal import Decimal

in_csv = unicodecsv.DictReader(sys.stdin, encoding='utf-8')
out_csv = unicodecsv.DictWriter(sys.stdout,
                                fieldnames=in_csv.fieldnames,
                                encoding='utf-8')
out_csv.writeheader()

try:
    for line in in_csv:
        line['service_std_target'] = "%0.2f" % (
            Decimal(line['service_std_target']) / 100)
        out_csv.writerow(line)
except KeyError:
    if 'warehouse' in sys.argv:
        sys.exit(85)
    else:
        raise
        return 'Público'
    print 'SEM FONTE:' + text
    return 'no source'


annotators = ['hanna', 'cristina', 'cfreitas', 'ccarvalho', 'andrea']
fieldnames = [
    'texto', 'fonte', 'sim_ironico', 'nao_ironico', 'naosei_ironico',
    'num_de_anotadores_total', 'Comparação', 'Hipérbole', 'Imparidade',
    'Metáfora', 'Paradoxo', 'Vulgarismo', 'Outro', 'Sem Evidência'
]
#fieldnames = ['texto', 'fonte', 'ironico', 'num_de_anotadores_ironico', 'num_de_anotadores_total', 'Comparação', 'Hipérbole', 'Imparidade', 'Metáfora', 'Paradoxo', 'Vulgarismo', 'Outro', 'Sem Evidência']
#output = codecs.open('annotated_10_INST.txt','w','utf-8')
#output = codecs.open('annotation_stats/data.txt','wb','utf-8')
output = open('data_all.csv', 'wb')
csvw = unicodecsv.DictWriter(output, delimiter='\t', fieldnames=fieldnames)
final = dict()
filename = 'block_0_IRONIA'
for an in annotators:
    try:
        with open('express_precuration/annotation/' + filename + '.tcf/' + an + '.tsv') as tsv:
            for line in csv.reader(tsv, dialect="excel-tab"):
                if len(line) > 0:
                    if line[0].startswith('#text='):
                        text = line[0].rsplit('[ [', 1)[0]
                        text = text.replace('#text=', '').decode('utf-8')
                        if not final.has_key(text):
                            origin = checkOrigin(text)
                            final[text] = {
    def get(self, request, number):
        """
        Creates a CSV for the order. The structure of the CSV looks like this:

            > Order Number:,EDX-100001

            > Seat in Demo with verified certificate (and ID verification)
            > Code,Redemption URL
            > J4HDI5OAUGCSUJJ3,ecommerce.server?code=J4HDI5OAUGCSUJJ3
            > OZCRR6WXLWGAFWZR,ecommerce.server?code=OZCRR6WXLWGAFWZR
            > 6KPYL6IO6Y3XL7SI,ecommerce.server?code=6KPYL6IO6Y3XL7SI
            > NPIJWIKNLRURYVU2,ecommerce.server?code=NPIJWIKNLRURYVU2
            > 6SZULKPZQYACAODC,ecommerce.server?code=6SZULKPZQYACAODC

        Args:
            request (Request): The GET request
            number (str): Number of the order

        Returns:
            HttpResponse

        Raises:
            Http404: When an order number for a non-existing order is passed.
            PermissionDenied: When a user tries to download a CSV for an order
                that he did not make.
        """
        try:
            order = Order.objects.get(number=number)
        except Order.DoesNotExist:
            raise Http404('Order not found.')

        if request.user != order.user and not request.user.is_staff:
            raise PermissionDenied

        file_name = 'Enrollment code CSV order num {}'.format(order.number)
        file_name = '{filename}.csv'.format(filename=slugify(file_name))
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename={filename}'.format(
            filename=file_name)

        redeem_url = get_ecommerce_url(reverse('coupons:offer'))
        voucher_field_names = ('Code', 'Redemption URL', 'Name Of Employee',
                               'Date Of Distribution', 'Employee Email')
        voucher_writer = csv.DictWriter(response, fieldnames=voucher_field_names)

        writer = csv.writer(response)
        writer.writerow(('Order Number:', order.number))
        writer.writerow([])

        order_line_vouchers = OrderLineVouchers.objects.filter(line__order=order)
        for order_line_voucher in order_line_vouchers:
            writer.writerow([order_line_voucher.line.product.title])
            voucher_writer.writeheader()

            for voucher in order_line_voucher.vouchers.all():
                voucher_writer.writerow({
                    voucher_field_names[0]: voucher.code,
                    voucher_field_names[1]: '{url}?code={code}'.format(url=redeem_url,
                                                                       code=voucher.code)
                })
            writer.writerow([])
        return response
def write_csv(outfile, fieldnames, data):
    with open(outfile, 'wb') as open_outfile:
        csvfile = csv.DictWriter(open_outfile, fieldnames)
        csvfile.writeheader()
        csvfile.writerows(data)
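# A minimal usage sketch for write_csv. The file name, field names and rows
# below are made up for illustration; it assumes the same binary-mode file
# handling convention as the helper itself.
rows = [
    {'name': 'Ada', 'year': '1842'},
    {'name': 'Grace', 'year': '1952'},
]
write_csv('people.csv', ['name', 'year'], rows)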
    final_data = []
    header_name = []
    for each_data in process:
        reduced = {}
        reduce_the_item(tool_name, each_data)
        header_name += reduced.keys()
        final_data.append(reduced)
    header_name = list(set(header_name))
    header_name.sort()
    fpointer3.close()
    '''
    The contents of the collated json file are written into a csv file
    '''
    with open(csvfilename, 'a+') as fpointer4:
        w = csv.DictWriter(fpointer4,
                           header_name,
                           quoting=csv.QUOTE_ALL,
                           encoding='utf-8')
        w.writeheader()
        for each_line in final_data:
            w.writerow(each_line)
    '''
    Success message is printed
    '''
    print("The csv and the collated json are successfully generated !")
#!/usr/bin/env python
import unicodecsv
import sys
import codecs

FIELDNAMES = 'ref_number,name,title_en,title_fr,description_en,description_fr,start_date,end_date,employee_attendees,guest_attendees,location_en,location_fr,total,owner_org,owner_org_title'.split(',')

assert sys.stdin.read(3) == codecs.BOM_UTF8

in_csv = unicodecsv.DictReader(sys.stdin, encoding='utf-8')
sys.stdout.write(codecs.BOM_UTF8)
out_csv = unicodecsv.DictWriter(sys.stdout,
                                fieldnames=FIELDNAMES,
                                encoding='utf-8')
out_csv.writeheader()

try:
    for line in in_csv:
        try:
            line['employee_attendees'] = str(int(line.pop('attendees')))
        except ValueError:
            line['employee_attendees'] = '0'
        line['guest_attendees'] = '0'
        out_csv.writerow(line)
except KeyError:
    if 'warehouse' in sys.argv:
        sys.exit(85)
    else:
        raise
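# A small standalone sketch of the BOM handling used above, run against an
# in-memory byte stream instead of sys.stdin (the sample bytes are made up):
import codecs
from io import BytesIO

stream = BytesIO(codecs.BOM_UTF8 + b'ref_number,name\r\n1,Alice\r\n')
assert stream.read(3) == codecs.BOM_UTF8  # consume the UTF-8 BOM before CSV parsing
print(stream.read())                      # the remaining bytes are plain UTF-8 CSV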
def analyze_course_content( course_id, listings_file=None, basedir="X-Year-2-data-sql", datedir="2013-09-21", use_dataset_latest=False, do_upload=False, courses=None, verbose=True, pin_date=None, ): ''' Compute course_content table, which quantifies: - number of chapter, sequential, vertical modules - number of video modules - number of problem, *openended, mentoring modules - number of dicussion, annotatable, word_cloud modules Do this using the course "xbundle" file, produced when the course axis is computed. Include only modules which had nontrivial use, to rule out the staff and un-shown content. Do the exclusion based on count of module appearing in the studentmodule table, based on stats_module_usage for each course. Also, from the course listings file, compute the number of weeks the course was open. If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset as the "stats_course_content" table. Also generate a "course_summary_stats" table, stored in the course_report_ORG or course_report_latest dataset. The course_summary_stats table combines data from many reports,, including stats_course_content, the medians report, the listings file, broad_stats_by_course, and time_on_task_stats_by_course. ''' if do_upload: if use_dataset_latest: org = "latest" else: org = courses[0].split( '/', 1)[0] # extract org from first course_id in courses crname = 'course_report_%s' % org gspath = gsutil.gs_path_from_course_id(crname) gsfnp = gspath / CCDATA gsutil.upload_file_to_gs(CCDATA, gsfnp) tableid = "stats_course_content" dataset = crname mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath try: the_schema = json.loads(open(SCHEMA_FILE).read())[tableid] except Exception as err: print "Oops! Failed to load schema file for %s. Error: %s" % ( tableid, str(err)) raise if 0: bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1) table = 'course_metainfo' course_tables = ',\n'.join([ ('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses ]) sql = "select * from {course_tables}".format( course_tables=course_tables) print "--> Creating %s.%s using %s" % (dataset, table, sql) if 1: metainfo_dataset = bqutil.get_bq_table( dataset, table, sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), ) # bqutil.create_bq_table(dataset, table, sql, overwrite=True) #----------------------------------------------------------------------------- # make course_summary_stats table # # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo. # Also use (and create if necessary) the nregistered_by_wrap table. 
# get the broad_stats_by_course data bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course') table_list = bqutil.get_list_of_table_ids(dataset) latest_person_course = max( [x for x in table_list if x.startswith('person_course_')]) print "Latest person_course table in %s is %s" % (dataset, latest_person_course) sql = """ SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, FROM [{dataset}.{person_course}] as pc left join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' )) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id """.format(dataset=dataset, person_course=latest_person_course) nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'}) # rates for registrants before and during course sql = """ SELECT *, ncertified / nregistered * 100 as pct_certified_of_reg, ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch, ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course, ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap, ncertified / nviewed * 100 as pct_certified_of_viewed, ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap, ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap, FROM ( # ------------------------ # get aggregate data SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.certified then 1 else 0 end) ncertified, sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap, sum(case when pc.viewed then 1 else 0 end) nviewed, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap, sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap, sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch, sum(case when pc.start_time < cminfo.launch_date and pc.certified then 1 else 0 end) ncertified_and_registered_before_launch, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) and pc.certified then 1 else 0 end) ncertified_and_registered_during_course, FROM [{dataset}.{person_course}] as pc left join ( # -------------------- # get course launch and wrap dates from course_metainfo SELECT AA.course_id as course_id, AA.wrap_date as wrap_date, AA.launch_date as launch_date, BB.ewrap_date as ewrap_date, FROM ( # inner get course launch and wrap dates from course_metainfo 
SELECT A.course_id as course_id, A.wrap_date as wrap_date, B.launch_date as launch_date, from ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' ) ) as A left outer join ( SELECT course_id, TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month, regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day, regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year, FROM [{dataset}.course_metainfo] where key='listings_Course Launch' ) ) as B on A.course_id = B.course_id # end inner course_metainfo subquery ) as AA left outer join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Empirical Course Wrap' ) ) as BB on AA.course_id = BB.course_id # end course_metainfo subquery # -------------------- ) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id # ---- end get aggregate data ) order by course_id """.format(dataset=dataset, person_course=latest_person_course) print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration' sys.stdout.flush() cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, newer_than=datetime.datetime( 2015, 1, 16, 3, 0), key={'name': 'course_id'}) # start assembling course_summary_stats c_sum_stats = defaultdict(OrderedDict) for entry in bsbc['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] cmci.update(entry) cnbw = nr_by_wrap['data_by_key'][course_id] nbw = int(cnbw['nregistered_by_wrap']) cmci['nbw_wrap_date'] = cnbw['wrap_date'] cmci['nregistered_by_wrap'] = nbw cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct'] cmci['frac_female'] = float(entry['n_female_viewed']) / (float( entry['n_male_viewed']) + float(entry['n_female_viewed'])) ncert = float(cmci['certified_sum']) if ncert: cmci[ 'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0 else: cmci['certified_of_nregistered_by_wrap_pct'] = None cbr = cert_by_reg['data_by_key'][course_id] for field, value in cbr.items(): cmci['cbr_%s' % field] = value # add medians for viewed, explored, and certified msbc_tables = { 'msbc_viewed': "viewed_median_stats_by_course", 'msbc_explored': 'explored_median_stats_by_course', 'msbc_certified': 'certified_median_stats_by_course', 'msbc_verified': 'verified_median_stats_by_course', } for prefix, mtab in msbc_tables.items(): print "--> Merging median stats data from %s" % mtab sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, mtab) for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): cmci['%s_%s' % (prefix, field)] = value # add time on task data tot_table = "time_on_task_stats_by_course" prefix = "ToT" print "--> Merging time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = 
{'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add serial time on task data tot_table = "time_on_task_serial_stats_by_course" prefix = "SToT" print "--> Merging serial time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add show_answer stats tot_table = "show_answer_stats_by_course" prefix = "SAS" print "--> Merging show_answer stats data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # setup list of keys, for CSV output css_keys = c_sum_stats.values()[0].keys() # retrieve course_metainfo table, pivot, add that to summary_stats print "--> Merging course_metainfo from %s" % table sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, table) listings_keys = map(make_key, [ "Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", "Course Number", "Short Title", "Andrew's Short Titles", "Title", "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id", "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade", '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)" ]) listings_keys.reverse() for lk in listings_keys: css_keys.insert(1, "listings_%s" % lk) COUNTS_TO_KEEP = [ 'discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 'option', 'lti', 'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 'options', 'comment_prompt', 'conditional', 'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 'customtag', 'transcript', 'split_test', 'word_cloud', 'openended', 'openendedparam', 'answer_display', 'code', 'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 'textannotation', 'imageannotation', 'videosequence', 'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion' ] for entry in bqdat['data']: thekey = make_key(entry['key']) # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP: # continue if thekey.startswith( 'listings_') and thekey[9:] not in listings_keys: # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id']) continue c_sum_stats[entry['course_id']][thekey] = entry['value'] #if 'certifies' in thekey: # print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value']) if thekey not in css_keys: css_keys.append(thekey) # compute forum_posts_per_week for course_id, entry in c_sum_stats.items(): nfps = entry.get('nforum_posts_sum', 0) if nfps: fppw = int(nfps) / 
float(entry['nweeks']) entry['nforum_posts_per_week'] = fppw print " course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % ( course_id, entry['total_assessments_per_week'], fppw) else: entry['nforum_posts_per_week'] = None css_keys.append('nforum_posts_per_week') # read in listings file and merge that in also if listings_file: if listings_file.endswith('.csv'): listings = csv.DictReader(open(listings_file)) else: listings = [json.loads(x) for x in open(listings_file)] for entry in listings: course_id = entry['course_id'] if course_id not in c_sum_stats: continue cmci = c_sum_stats[course_id] for field, value in entry.items(): lkey = "listings_%s" % make_key(field) if not (lkey in cmci) or (not cmci[lkey]): cmci[lkey] = value print "Storing these fields: %s" % css_keys # get schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads( open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read()) schema_dict = {x['name']: x for x in the_schema} # write out CSV css_table = "course_summary_stats" ofn = "%s__%s.csv" % (dataset, css_table) ofn2 = "%s__%s.json" % (dataset, css_table) print "Writing data to %s and %s" % (ofn, ofn2) ofp = open(ofn, 'w') ofp2 = open(ofn2, 'w') dw = csv.DictWriter(ofp, fieldnames=css_keys) dw.writeheader() for cid, entry in c_sum_stats.items(): for ek in entry: if ek not in schema_dict: entry.pop(ek) # entry[ek] = str(entry[ek]) # coerce to be string ofp2.write(json.dumps(entry) + "\n") for key in css_keys: if key not in entry: entry[key] = None dw.writerow(entry) ofp.close() ofp2.close() # upload to bigquery # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ] if 1: gsfnp = gspath / dataset / (css_table + ".json") gsutil.upload_file_to_gs(ofn2, gsfnp) # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False, # format='csv', skiprows=1) bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False) return print "-" * 60 + " %s" % course_id # get nweeks from listings lfn = path(listings_file) if not lfn.exists(): print "[analyze_content] course listings file %s doesn't exist!" % lfn return data = None if listings_file.endswith('.json'): data_feed = map(json.loads, open(lfn)) else: data_feed = csv.DictReader(open(lfn)) for k in data_feed: if not 'course_id' in k: print "Strange course listings row, no course_id in %s" % k raise Exception("Missing course_id") if k['course_id'] == course_id: data = k break if not data: print "[analyze_content] no entry for %s found in course listings file %s!" % ( course_id, lfn) return def date_parse(field): (m, d, y) = map(int, data[field].split('/')) return datetime.datetime(y, m, d) launch = date_parse('Course Launch') wrap = date_parse('Course Wrap') ndays = (wrap - launch).days nweeks = ndays / 7.0 print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays) if pin_date: datedir = pin_date course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest and not pin_date) cfn = gsutil.path_from_course_id(course_id) xbfn = course_dir / ("xbundle_%s.xml" % cfn) if not xbfn.exists(): print "[analyze_content] cannot find xbundle file %s for %s!" % ( xbfn, course_id) if use_dataset_latest: # try looking in earlier directories for xbundle file import glob spath = course_dir / ("../*/xbundle_%s.xml" % cfn) files = list(glob.glob(spath)) if files: xbfn = path(files[-1]) if not xbfn.exists(): print " --> also cannot find any %s ; aborting!" 
% spath else: print " --> Found and using instead: %s " % xbfn if not xbfn.exists(): raise Exception("[analyze_content] missing xbundle file %s" % xbfn) # if there is an xbundle*.fixed file, use that instead of the normal one if os.path.exists(str(xbfn) + ".fixed"): xbfn = path(str(xbfn) + ".fixed") print "[analyze_content] For %s using %s" % (course_id, xbfn) # get module usage data mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest) xml = etree.parse(open(xbfn)).getroot() counts = defaultdict(int) nexcluded = defaultdict(int) IGNORE = [ 'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title' ] problem_stats = defaultdict(int) def does_problem_have_random_script(problem): ''' return 1 if problem has a script with "random." in it else return 0 ''' for elem in problem.findall('.//script'): if elem.text and ('random.' in elem.text): return 1 return 0 # walk through xbundle def walk_tree(elem, policy=None): ''' Walk XML tree recursively. elem = current element policy = dict of attributes for children to inherit, with fields like due, graded, showanswer ''' policy = policy or {} if type(elem.tag) == str and (elem.tag.lower() not in IGNORE): counts[elem.tag.lower()] += 1 if elem.tag in [ "sequential", "problem", "problemset", "course", "chapter" ]: # very old courses may use inheritance from course & chapter keys = ["due", "graded", "format", "showanswer", "start"] for k in keys: # copy inheritable attributes, if they are specified val = elem.get(k) if val: policy[k] = val if elem.tag == "problem": # accumulate statistics about problems: how many have show_answer = [past_due, closed] ? have random. in script? problem_stats['n_capa_problems'] += 1 if policy.get('showanswer'): problem_stats["n_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118 # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed. 
problem_stats[ 'n_random_script'] += does_problem_have_random_script(elem) if policy.get('graded') == 'true' or policy.get( 'graded') == 'True': problem_stats['n_capa_problems_graded'] += 1 problem_stats[ 'n_graded_random_script'] += does_problem_have_random_script( elem) if policy.get('showanswer'): problem_stats["n_graded_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_graded_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) for k in elem: midfrag = (k.tag, k.get('url_name_orig', None)) if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20: nexcluded[k.tag] += 1 if verbose: try: print " -> excluding %s (%s), ncount=%s" % ( k.get('display_name', '<no_display_name>').encode('utf8'), midfrag, mudata.get(midfrag, {}).get('ncount')) except Exception as err: print " -> excluding ", k continue walk_tree(k, policy.copy()) walk_tree(xml) print "--> Count of individual element tags throughout XML: ", counts print "--> problem_stats:", json.dumps(problem_stats, indent=4) # combine some into "qual_axis" and others into "quant_axis" qual_axis = [ 'openassessment', 'optionresponse', 'multiplechoiceresponse', # 'discussion', 'choiceresponse', 'word_cloud', 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti' ] quant_axis = [ 'formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse', 'imageresponse' ] nqual = 0 nquant = 0 for tag, count in counts.items(): if tag in qual_axis: nqual += count if tag in quant_axis: nquant += count print "nqual=%d, nquant=%d" % (nqual, nquant) nqual_per_week = nqual / nweeks nquant_per_week = nquant / nweeks total_per_week = nqual_per_week + nquant_per_week print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % ( nqual_per_week, nquant_per_week, total_per_week) # save this overall data in CCDATA lock_file(CCDATA) ccdfn = path(CCDATA) ccd = {} if ccdfn.exists(): for k in csv.DictReader(open(ccdfn)): ccd[k['course_id']] = k ccd[course_id] = { 'course_id': course_id, 'nweeks': nweeks, 'nqual_per_week': nqual_per_week, 'nquant_per_week': nquant_per_week, 'total_assessments_per_week': total_per_week, } # fields = ccd[ccd.keys()[0]].keys() fields = [ 'course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks' ] cfp = open(ccdfn, 'w') dw = csv.DictWriter(cfp, fieldnames=fields) dw.writeheader() for cid, entry in ccd.items(): dw.writerow(entry) cfp.close() lock_file(CCDATA, release=True) # store data in course_metainfo table, which has one (course_id, key, value) on each line # keys include nweeks, nqual, nquant, count_* for module types * cmfields = OrderedDict() cmfields['course_id'] = course_id cmfields['course_length_days'] = str(ndays) cmfields.update( {make_key('listings_%s' % key): value for key, value in data.items()}) # from course listings cmfields.update(ccd[course_id].copy()) # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() }) # from content counts cmfields['filename_xbundle'] = xbfn cmfields['filename_listings'] = lfn for key in sorted( counts ): # store counts in sorted order, so that the later generated CSV file can have a predictable structure value = counts[key] cmfields['count_%s' % key] = str(value) # from content counts for key in sorted(problem_stats): # store problem stats value = problem_stats[key] cmfields['problem_stat_%s' % key] = str(value) cmfields.update({('nexcluded_sub_20_%s' % key): str(value) for key, value in nexcluded.items() }) # 
from content counts course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) csvfn = course_dir / CMINFO # manual overriding of the automatically computed fields can be done by storing course_id,key,value data # in the CMINFO_OVERRIDES file csvfn_overrides = course_dir / CMINFO_OVERRIDES if csvfn_overrides.exists(): print "--> Loading manual override information from %s" % csvfn_overrides for ovent in csv.DictReader(open(csvfn_overrides)): if not ovent['course_id'] == course_id: print "===> ERROR! override file has entry with wrong course_id: %s" % ovent continue print " overriding key=%s with value=%s" % (ovent['key'], ovent['value']) cmfields[ovent['key']] = ovent['value'] print "--> Course metainfo writing to %s" % csvfn fp = open(csvfn, 'w') cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value']) cdw.writeheader() for k, v in cmfields.items(): cdw.writerow({'course_id': course_id, 'key': k, 'value': v}) fp.close() # build and output course_listings_and_metainfo dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) mypath = os.path.dirname(os.path.realpath(__file__)) clm_table = "course_listing_and_metainfo" clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table) clm_schema = json.loads(open(clm_schema_file).read()) clm = {} for finfo in clm_schema: field = finfo['name'] clm[field] = cmfields.get(field) clm_fnb = clm_table + ".json" clm_fn = course_dir / clm_fnb open(clm_fn, 'w').write(json.dumps(clm)) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / clm_fnb print "--> Course listing + metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, clm_table) sys.stdout.flush() gsutil.upload_file_to_gs(clm_fn, gsfnp) bqutil.load_data_to_table(dataset, clm_table, gsfnp, clm_schema, wait=True, verbose=False) # output course_metainfo table = 'course_metainfo' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / CMINFO print "--> Course metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, table) sys.stdout.flush() gsutil.upload_file_to_gs(csvfn, gsfnp) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())[table] bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
    def test_encode_error_dictwriter(self):
        fd = BytesIO()
        dw = csv.DictWriter(fd, ['col1'],
                            encoding='cp1252',
                            errors='xmlcharrefreplace')
        dw.writerow({'col1': chr(2604)})
        # U+0A2C cannot be encoded in cp1252, so it is written as an XML
        # character reference rather than the raw character.
        self.assertEqual(fd.getvalue(), b'&#2604;\r\n')
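# The test above relies on the `errors` argument being forwarded to the byte
# encoder: 'xmlcharrefreplace' turns any character that cp1252 cannot encode
# into an XML character reference. The same behaviour, as a one-line sketch:
print(u'\u0a2c'.encode('cp1252', 'xmlcharrefreplace'))  # -> b'&#2604;'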
#encoding=utf-8
import django
django.setup()
import requests
from bs4 import BeautifulSoup, element
import unicodecsv as csv
import codecs

dict_list = []
for i in range(1, 257):
    url = u'http://www.nechama.org.il/commentatorsPopup/{}.html'.format(i)
    r = requests.get(url, )
    data = r.content
    content = BeautifulSoup(data, "lxml")
    title = content.find(attrs={'id': 'contentTop'}).get_text()
    text = content.find(attrs={'id': 'contentBody'}).get_text()
    dict_list.append({u'number': i, u'name': title, u'text': text})

with open('parshanim.csv', 'w') as csv_file:
    writer = csv.DictWriter(csv_file, [u'number', u'name', u'text'])
    writer.writeheader()
    writer.writerows(dict_list)
print "done"
    def __init__(self, data_file):
        self._csv_writer = unicodecsv.DictWriter(data_file,
                                                 fieldnames=_REQUIRED_COLUMNS,
                                                 lineterminator="\n")
        self._csv_writer.writeheader()
def _make_csv(abbr, name, fields):
    filename = '/tmp/{0}_{1}'.format(abbr, name)
    f = unicodecsv.DictWriter(open(filename, 'w'), fields)
    f.writerow(dict(zip(fields, fields)))
    return filename, f
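# A hypothetical call to _make_csv, which writes the header row by zipping the
# field names against themselves and returns the open writer for further rows.
# The abbreviation, file name and field values here are made up:
filename, writer = _make_csv('ca', 'legislators.csv', ['name', 'party'])
writer.writerow({'name': u'Jane Doe', 'party': u'Independent'})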
def getWordLinks(args): print("Getting .docx Links") # Handle arguments and flags parser = argparse.ArgumentParser(usage=instructions, add_help=False) parser.add_argument("--help", "-h", action="store_true") parser.add_argument("-r", action="store_true") parser.add_argument("-l", action="store_true") parser.add_argument("-o", action="store") parser.add_argument("file_names", nargs="*") args = parser.parse_args(args) # Replace arguments with wildcards with their expansion. # If a string does not contain a wildcard, glob will return it as is. # Mostly important if we run this on Windows systems. file_names = list() for name in args.file_names: file_names += glob.glob(glob.escape(name)) # If the filenames don't exist, say so and quit. if file_names == []: sys.exit("No file or directory found by that name.") # Don't run the script on itself. if sys.argv[0] in file_names: file_names.remove(sys.argv[0]) if args.help: sys.exit(instructions) filecount = 0 linklist = [] target_is_folder = False for name in file_names: # Make sure single files exist. assert os.path.exists(name), "File or directory not found." # If it's just a file... if os.path.isfile(name): # Make sure this is a Word file (just check extension) if name.lower().endswith(".docx") or name.lower().endswith( ".docm"): # Get links from that file. linklist.extend(getLinks(name, args, False)) filecount += 1 # If it's a directory: if os.path.isdir(name): target_is_folder = True # Recursive version using os.walk for all levels. if args.r: for dirpath, dirnames, files in os.walk(name): for eachfile in files: # Get links for every file in that directory. if eachfile.lower().endswith( ".docx") or eachfile.lower().endswith(".docm"): linklist.extend(getLinks(eachfile, args, dirpath)) filecount += 1 # Non-recursive version breaks os.walk after the first level. else: topfiles = [] for (dirpath, dirnames, files) in os.walk(name): topfiles.extend(files) break for eachfile in topfiles: if eachfile.lower().endswith( ".docx") or eachfile.lower().endswith(".docm"): linklist.extend(getLinks(eachfile, args, dirpath)) filecount += 1 # When called by other scripts, quietly return the list and stop. if args.l: return linklist # Otherwise, output a file and print some info. print("\nChecked " + str(filecount) + " .docx file" + ("s" if filecount > 1 else "") + " for links.") # Create output file as sibling to the original target of the script. outFileName = args.o if args.o else "Word_Doc_Links.csv" if target_is_folder: outFileFolder = os.path.abspath(os.path.join(file_names[0], os.pardir)) outFilePath = os.path.join(outFileFolder, outFileName) else: outFilePath = os.path.join(os.path.dirname(file_names[0]), outFileName) with open(outFilePath, "wb") as outputFile: fieldnames = ["filename", "href", "text"] writer = csv.DictWriter(outputFile, fieldnames=fieldnames, extrasaction="ignore") writer.writeheader() for row in linklist: writer.writerow(row) print("Spreadsheet created: " + outFileName) print("Location: " + outFilePath)
        except ValueError:
            pass
    else:
        raise ValueError
    for fmt in formats:
        try:
            return datetime.strptime(d, fmt)
        except ValueError:
            pass
    return from_excel(int(d))


in_csv = unicodecsv.DictReader(sys.stdin, encoding='utf-8')
out_csv = unicodecsv.DictWriter(sys.stdout,
                                fieldnames=FIELDNAMES,
                                encoding='utf-8')
out_csv.writeheader()

err_csv = None
original = None
line = None
if sys.argv[1:]:
    err_csv = unicodecsv.DictWriter(open(sys.argv[1], 'wb'),
                                    fieldnames=in_csv.fieldnames,
                                    encoding='utf-8')
    err_csv.writeheader()


def error(msg, value=None):
    sys.stderr.write(line['owner_org'] + ' ' + line['ref_number'] + ' ' + msg +
if e['title'] == '"Lady Marmalade"' and "Christina" in e['artists']: row['title'] = '"Lady Marmalade (Moulin Rouge)"' prev_artist = e['artists'] res_updated.append(row) # Consolidate titles res_by_title = defaultdict(dict) for e in res_updated: row = {} row['artists'] = e['artists'] row['title'] = e['title'] row['entry_{0}'.format(e['entry'])] = e['date'] row['weeks_{0}'.format(e['entry'])] = e['weeks'] res_by_title[e['title'] + e['artists']].update(row) # Return to list form res_final = [] for key, value in res_by_title.items(): res_final.append(value) # Prepare fields for csv export. csv_fields = [ 'title', 'artists', 'entry_1', 'entry_2', 'entry_3', 'weeks_1', 'weeks_2', 'weeks_3' ] # Export csv with open('billboard_wiki.csv', 'wb') as f: writer = unicodecsv.DictWriter(f, csv_fields) writer.writeheader() writer.writerows(res_final)
    return properties_list


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='')
    sortorder_help = """
    available sort orders are :
    newest : Latest property details,
    cheapest : Properties with cheapest price
    """
    argparser.add_argument('sort',
                           nargs='?',
                           help=sortorder_help,
                           default='Homes For You')
    args = argparser.parse_args()
    zipcode = args.zipcode
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode, sort)
    print("Writing data to output file")

    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = [
            'title', 'address', 'city', 'state', 'postal_code', 'price',
            'facts and features', 'real estate provider', 'url'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in scraped_data:
            writer.writerow(row)
continue source_id = arg posts = soup.findAll('div', attrs={'class': '_5pcr userContentWrapper'}) output = "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /><title>Extraction results</title></head>\n<body><table border=1 style='font-size:13px;border-collapse:collapse;table-layout:fixed;width:1300px;word-break:break-all'><tr><td style='width:30px'><center>#</center></td><td style='width:120px;'>Post id</td><td style='width:100px;'>Time_published</td><td style='width:100px;'>Author name</a></td><td style='width:100px;'>Author ID</td><td style='width:300px'>Post message</td><td style='width:45px'><center>Shared<br> as</center></td><td style='width:25px'><center>#<br>pics</center></td><td style='width:100px;'><center>Pics</center></td><td style='width:25px'><center>#<br>vids</center></td><td style='width:100px'><center>Vids</center></td><td style='width:30px'><center>#<br>links</center></td><td style='width:40px'><center>Links</center></td><td style='width:40px'><center>Reacts</center></td><td style='width:40px'><center>Like</center></td><td style='width:40px'><center>Love</center></td><td style='width:40px'><center>Haha</center></td><td style='width:40px'><center>Angry</center></td><td style='width:40px'><center>Sad</center></td><td style='width:40px'><center>Wow</center></td><td style='width:40px'><center>Shares</center></td><td style='width:40px'><center>Comments</center></td></tr>" with open(arg + ".csv", 'wb') as csvfile: fieldnames = [ 'source_id', 'post_id', 'post_url', 'created_time', 'author_name', 'author_id', 'msg', 'shared_as', 'pic_count', 'pics', 'vid_count', 'vids', 'link_count', 'links', 'reactions', 'like', 'love', 'haha', 'angry', 'sad', 'wow', 'shares', 'comment_count' ] writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=',', lineterminator='\n') writer.writeheader() index = 0 regex1 = re.compile('.*\.php\?id\=.+?\&.*') regex2 = re.compile('.*_5pbx.*') regex3 = re.compile('.*scaledImageFit.*') regex4 = re.compile('async.*') img_regex = re.compile('_s0 _4ooo _\d.*?_rw img') for post in posts: index = index + 1 try: #print "trying "+str(index) post_id = post.find('div',
        'Hierarchies (count)': len(publisher_stats['hierarchies']),
        'Hierarchies': ';'.join(publisher_stats['hierarchies']),
    }


with open(os.path.join('out', 'publishers.csv'), 'w') as fp:
    writer = unicodecsv.DictWriter(fp, [
        'Publisher Name',
        'Publisher Registry Id',
        'Activities',
        'Organisations',
        'Files',
        'Activity Files',
        'Organisation Files',
        'Total File Size',
        'Reporting Org on Registry',
        'Reporting Orgs in Data (count)',
        'Reporting Orgs in Data',
        'Hierarchies (count)',
        'Hierarchies',
    ])
    writer.writeheader()
    for d in publisher_dicts():
        writer.writerow(d)

publishers = data.current_stats['inverted_publisher']['activities'].keys()

with open(os.path.join('out', 'elements.csv'), 'w') as fp:
    writer = unicodecsv.DictWriter(fp, ['Element'] + publishers)
def runchart(coinObj): plt.xticks(rotation = 90) # xs = df['date'][:100] # print (xs) # ys = df['price'][:100] # print (ys) print (list(coinObj.keys())[0]) xs = coinObj[list(coinObj.keys())[0]]['date'][:100] xs = pd.to_datetime(xs) #print(xs) #coinList = list(coinObj) # for idx, coinNM in enumerate(coinList): chart_idx = 1 print('len',len(coinObj)) # coinList = [] # coinNmList = [] for coinNM in coinObj: # coninList = coinObj[coinNM] print (coinObj[coinNM]) xs = coinObj[coinNM]['date'] xst = pd.to_datetime(xs) ys = coinObj[coinNM]['price'] coinObjlen = len(coinObj) #chart create subnum = coinObjlen *100 +10 + chart_idx plt.subplot(subnum) plt.ylabel(coinNM) plt.xlabel('Exposure Time') plt.plot(xst, ys ,label=coinNM) plt.grid(True) plt.legend(loc='best', title=coinNM) chart_idx += 1 meme_buy_count = 1; meme_sell_count = 1; memelist = [] for idx, val in enumerate(ys): print (idx, val ,ys[idx]) mobj = collections.OrderedDict() mobj['date'] ='' mobj['price'] ='' mobj['buy_volume'] = 0 mobj['sell_volume'] = 0 mobj['sum_buy'] = 0 mobj['sum_sell'] = 0 mobj['sum_volume'] = 0 if idx < len(ys) - 3: # buy if ys[idx] > ys[idx + 1]: if ys[idx + 1] > ys[idx + 2]: if ys[idx + 2] > ys[idx + 3]: # if not memelist or memelist[-1]['sell_volume'] ==1: # print('b', meme_buy_count, str(idx - 1) + '>' + str(idx), ys[idx]) meme_buy_count += 1 plt.scatter(xst[idx], ys[idx], color='g', marker='^') mobj['buy_volume'] = 1 mobj['sum_buy'] = ys[idx] * mobj['buy_volume'] # memelist.append(mobj) # coinObj[coinNM][idx]['buy'] = 1 # else: # coinObj[coinNM][idx]['buy'] = 0 # sell if ys[idx] < ys[idx + 1]: if ys[idx + 1] < ys[idx + 2]: if ys[idx + 2] < ys[idx + 3]: # if memelist[-1]['buy_volume'] ==1: print('s', meme_sell_count, str(idx - 1) + '>' + str(idx), ys[idx]) meme_sell_count += 1 plt.scatter(xst[idx], ys[idx], color='r', marker='v') mobj['sell_volume'] = 1 if memelist: mobj['sum_sell'] = ys[idx] * mobj['sell_volume'] mobj['date'] = xs[idx] mobj['price'] = ys[idx] if memelist : mobj['sum_volume'] = memelist[-1]['sell_volume']+mobj['buy_volume']- mobj['sell_volume'] #mobj['sell_volume'] = memelist[-1]['sell_volume']+mobj['sell_volume'] #mobj['buy_volume'] = memelist[-1]['buy_volume']+ mobj['buy_volume'] memelist.append(mobj) print(memelist[:100]) keys = memelist[0].keys() with open('t/'+coinNM+'_trade.csv', 'wb') as output_file: dict_writer = csv.DictWriter(output_file, keys) dict_writer.writeheader() dict_writer.writerows(memelist) plt.xticks(rotation=90) plt.show()
    def handle(self, **options):
        output = options['output']

        from froide.publicbody.models import PublicBody
        from froide.foirequest.models import FoiRequest

        year = 2015

        def get_status(status):
            KNOWN_STATUS = (
                'successful',
                'partially_successful',
                'refused',
            )
            MAP_STATUS = {}
            if status in KNOWN_STATUS:
                return status
            return MAP_STATUS.get(status, 'other')

        def convert_value(val):
            if not val:
                return 0
            else:
                return int(val)

        def stats_for_queryset(qs, key=None):
            status_counter = Counter()
            for r in qs:
                arg = key
                if arg is None:
                    arg = r.public_body.name
                status_counter[get_status(r.status)] += 1
            return status_counter

        output = open(output, 'w')
        writer = unicodecsv.DictWriter(output,
                                       ('name', 'gb', 'year', 'total_count'),
                                       encoding='utf-8')
        writer.writeheader()
        short_names = [
            "BK", "BMAS", "AA", "BMI", "BMJV", "BMF", "BMWi", "BMEL", "BMVg",
            "BMFSFJ", "BMG", "BMVI", "BMUB", "BMBF", "BKM", "BMZ", "BPA",
            "BPräsA", "BT", "BR", "BBank", "BfDI", "BRH", 'BVerfG'
        ]
        for year in range(2011, 2016):
            for short_name in short_names:
                print(short_name)
                try:
                    root_pb = PublicBody.objects.get(
                        jurisdiction_id=1,
                        other_names__contains='%s,' % short_name
                    )
                except PublicBody.DoesNotExist:
                    print('missing')
                    continue
                root_count = root_pb.foirequest_set.filter(
                    first_message__year=year, is_foi=True).count()
                pbs = PublicBody.objects.filter(root=root_pb)
                qs = FoiRequest.objects.filter(first_message__year=year,
                                               public_body__in=pbs,
                                               is_foi=True)
                total_count = len(list(qs))
                writer.writerow({
                    'name': short_name,
                    'year': year,
                    'gb': 'True',
                    'total_count': total_count,
                })
                writer.writerow({
                    'name': short_name,
                    'year': year,
                    'gb': 'False',
                    'total_count': root_count,
                })
    def write_headers(self):
        self.csv_writer = csv.DictWriter(self.fout, self._headers)
        self.csv_writer.writeheader()
def main(): model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS) stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS) f = open('test_data.csv','wb') w = csv.DictWriter(f, ["pmid", "domain", "sent_text", "random", "human", "algorithm", "top3", "top1"], escapechar="\\") w.writeheader() # parse the risk of bias data from Cochrane data = riskofbias.RoBData(test_mode=False) data.generate_data(doc_level_only=False) docs = riskofbias.MultiTaskSentFilter(data) uids = np.array(docs.get_ids()) no_studies = len(uids) kf = KFold(no_studies, n_folds=5, shuffle=False) tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]} vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space for k_i, (train, test) in enumerate(kf): if k_i == 1: break y_train = docs.y(uids[train]) vec.builder_clear() vec.builder_add_interaction_features(docs.X(uids[train]), low=7) # add base features vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2) # then add interactions X_train = vec.builder_fit_transform() clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall', n_jobs=16) # import pdb; pdb.set_trace() clf.fit(X_train, y_train) del X_train, y_train clf = clf.best_estimator_ # and we only need the best performing, discard the rest # Test on each domain in turn # filtered_data = riskofbias.SentFilter(data) for domain in riskofbias.CORE_DOMAINS: print "Testing on %s" % domain vec.builder_clear() vec.builder_add_interaction_features(docs.X(uids[test], domain=domain)) # add base features vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain)) # then add interactions X_test = vec.builder_transform() y_test = docs.y(uids[test], domain=domain) y_preds = clf.predict(X_test) y_df = clf.decision_function(X_test) # get distances from the decision boundary # positive distances = more likely to be relevant sentences r_len = len(y_preds) y_top3 = [] y_top1 = [] y_rand = [] y_uids = np.array(docs.y_uids(uids[test], domain=domain)) # import pdb; pdb.set_trace() for y_uid in np.unique(y_uids): mask = np.where(y_uids == y_uid)[0] doc_df = y_df[mask] doc_top3 = np.argpartition(doc_df, -3)[-3:] y_top3.extend(list(mask[doc_top3])) doc_top1 = np.argmax(doc_df) y_top1.append(mask[doc_top1]) doc_rand = np.random.randint(0, len(doc_df)) y_rand.append(mask[doc_rand]) human_sent_indices = np.where(y_test==1)[0] algorithm_sent_indices = np.where(y_preds==1)[0] model_metrics.add_preds_test(y_preds, y_test, domain=domain) stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain) # import pdb; pdb.set_trace() for doc_i, (doc, pmid) in enumerate(izip(docs.X(uids[test], domain=domain), docs.iter_pmid(uids[test], domain=domain))): row = {"domain": domain, "sent_text": doc, "random": doc_i in y_rand, "human": doc_i in human_sent_indices, "algorithm": doc_i in algorithm_sent_indices, "top3": doc_i in y_top3, "top1": doc_i in y_top1, "pmid": pmid} if row["random"] or row["human"] or row["top3"] or row["top1"]: # please note, the sentences will only be included in the analysis if # in the top1 or top3 # we do have data on whether the raw classifier has predicted yes/no # # this in effect means where the classifier picks <= 3 sentences # we use all raw classifier data # where >3 sentences are predicted by raw classifier, only the # top 3 are used; the rest are 
discarded w.writerow(row) del X_test, y_test, y_preds del clf model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model"))) stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline"))) f.close()
#!/usr/bin/python
from lxml import etree
import unicodecsv as csv

departements = ['03', '15', '19', '23', '43', '63']
circos = ['01', '02', '03', '04', '05']

ficsv = open('lg017_propor_tour1.csv', 'w')
try:
    fieldnames = []
    fieldnames.extend([
        'circo', 'inscrits', 'votants', 'exprimes', 'abstentions', 'blancs',
        'nuls', 'EXG', 'COM', 'FI', 'SOC', 'RDG', 'DVG', 'ECO', 'DIV', 'REG',
        'REM', 'MDM', 'UDI', 'LR', 'DVD', 'DLF', 'FN', 'EXD'
    ])
    majcsv = csv.DictWriter(ficsv, fieldnames=fieldnames)
    majcsv.writeheader()
    for dep in departements:
        for circo in circos:
            try:
                arbre = etree.parse(
                    "http://elections.interieur.gouv.fr/telechargements/LG2017/resultatsT1/0"
                    + dep + "/0" + dep + circo + ".xml")
                print("dep OK")
                for noeud in arbre.xpath(
                        "//Election/Departement/Circonscription"):
                    objet = {}
                    for circ in noeud.xpath("CodCirLg"):
                        objet["circo"] = dep + circ.text
                    for resultats in noeud.xpath("Tours/Tour[NumTour=1]"):
                        for inscrits in resultats.xpath(
    print(file=sys.stderr)
    print('writing courses...', file=sys.stderr)
    # print(os)
    # print(oc)
    for oid, sids in os.iteritems():
        oname = get_offering_name(oid)
        for sid in sids:
            s = get_section(oid, sid)
            if not s:
                continue
            s['name'] = oname
            s['catalogs'] = ';'.join(oc[oid])
            yield s
            time.sleep(0.2)  # be a good boy; don't stress the server


if __name__ == '__main__':
    writer = unicodecsv.DictWriter(sys.stdout, [
        'id', 'name', 'location', 'start', 'end', 'day', 'time', 'cost',
        'credit', 'instructors', 'catalogs', 'link'
    ], delimiter='\t')
    writer.writeheader()
    for s in get_all_sections():
        writer.writerow(s)
    print('done', file=sys.stderr)
status = "Destroyed" if status == "Created": location = line.split(" at ")[1].split(" created")[0] elif status == "Destroyed": location = line.split(" at ")[1].split(" destroyed ")[0] rally = {"Datetime": date_val, "Location": location, "Status": status} outdata.append(rally) #print(current_map) keys = outdata[0].keys() with open('rally_point_location.csv', 'wb') as out_file: dict_writer = csv.DictWriter(out_file, keys) dict_writer.writeheader() dict_writer.writerows(outdata) # Get player kill info #[2018.08.31-04.55.10:064][730]LogSquad: Player:[LLJK] ☢Riyott.exe☢ ActualDamage=186.000015 from oldstastic2011 caused by BP_M4_M68_C_20 #[2018.08.31-04.55.35:645][ 2]LogSquad: ScorePoints: Points: -1.000000 ScoreEvent: TeamKilled Jordan Reagan regex = r"(Player:.*) (ActualDamage=.*) from (.*) caused by (.*)" kill_logs = []
def getWordLinks(args): # Handle arguments and flags parser = argparse.ArgumentParser(usage=instructions, add_help=False) parser.add_argument('--help', '-h', action='store_true') parser.add_argument('-r', action='store_true') parser.add_argument('-l', action='store_true') parser.add_argument('file_names', nargs='*') args = parser.parse_args(args) # Replace arguments with wildcards with their expansion. # If a string does not contain a wildcard, glob will return it as is. # Mostly important if we run this on Windows systems. file_names = list() for name in args.file_names: file_names += glob(name) # If the filenames don't exist, say so and quit. if file_names == []: sys.exit('No file or directory found by that name.') # Don't run the script on itself. if sys.argv[0] in file_names: file_names.remove(sys.argv[0]) optionlist = [] if args.help: sys.exit(instructions) if args.r: optionlist.append('r') if args.l: optionlist.append('l') filecount = 0 linklist = [] target_is_folder = False for name in file_names: # Make sure single files exist. assert os.path.exists(name), "File or directory not found." # If it's just a file... if os.path.isfile(name): # Make sure this is an sjson file (just check extension) if name.lower().endswith('.docx'): # Convert it to an SRT file linklist.extend(getLinks(name, optionlist, False)) filecount += 1 # If it's a directory: if os.path.isdir(name): target_is_folder = True # Recursive version using os.walk for all levels. if 'r' in optionlist: for dirpath, dirnames, files in os.walk(name): for eachfile in files: # Convert every file in that directory. if eachfile.lower().endswith('.docx'): linklist.extend( getLinks(eachfile, optionlist, dirpath)) filecount += 1 # Non-recursive version breaks os.walk after the first level. else: topfiles = [] for (dirpath, dirnames, files) in os.walk(name): topfiles.extend(files) break for eachfile in topfiles: if eachfile.lower().endswith('.docx'): linklist.extend(getLinks(eachfile, optionlist, dirpath)) filecount += 1 # When called by other scripts, quietly return the list and stop. if 'l' in optionlist: return linklist # Otherwise, output a file and print some info. print('\nChecked ' + str(filecount) + ' .docx file' + ('s' if filecount > 1 else '') + ' for links.') # Create output file as sibling to the original target of the script. if target_is_folder: outFileFolder = os.path.abspath(os.path.join(file_names[0], os.pardir)) outFilePath = os.path.join(outFileFolder, 'Word_Doc_Links.csv') else: outFilePath = os.path.join(os.path.dirname(file_names[0]), 'Word_Doc_Links.csv') with open(outFilePath, 'wb') as outputFile: fieldnames = ['filename', 'url', 'linktext'] writer = csv.DictWriter(outputFile, fieldnames=fieldnames, extrasaction='ignore') writer.writeheader() for row in linklist: writer.writerow(row) print 'Spreadsheet created: Word_Doc_Links.csv' print 'Location: ' + outFilePath
def writeCourseSheet(rootFileDir, rootFileName, course_dict, args):
    course_name = course_dict['name']
    if args.links:
        course_name += ' Links'
    course_name += '.tsv'
    outFileName = args.o if args.o else course_name

    # Create a "csv" file with tabs as delimiters
    with open(os.path.join(rootFileDir, outFileName), 'wb') as outputfile:
        fieldnames = [
            'chapter', 'sequential', 'vertical', 'component', 'type', 'url'
        ]
        # Include the XML if we're dealing with problems
        if args.problems:
            fieldnames.append('inner_xml')
        # Include link data if we're dealing with links
        if args.links:
            fieldnames = fieldnames + ['href', 'linktext']
        # Include video data if we're dealing with videos
        if args.video:
            fieldnames = fieldnames + [
                'duration', 'sub', 'youtube', 'edx_video_id', 'upload_name'
            ]

        writer = csv.DictWriter(outputfile,
                                delimiter='\t',
                                fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()

        spreadsheet = fillInRows(courseFlattener(course_dict))
        for index, row in enumerate(spreadsheet):
            for key in row:
                spreadsheet[index][key] = spreadsheet[index][key]
        printable = []

        if args.all:
            printable = spreadsheet
        else:
            if args.links:
                printable += [
                    row for row in spreadsheet if row['type'] in
                    ['html', 'problem', 'xml', 'docx', 'pptx', 'xlsx']
                ]
            if args.html:
                printable += [
                    row for row in spreadsheet if row['type'] == 'html'
                ]
            if args.video:
                printable += [
                    row for row in spreadsheet if row['type'] == 'video'
                ]
            if args.problems:
                printable += [
                    row for row in spreadsheet if row['type'] == 'problem'
                ]

        for row in printable:
            if args.links:
                if row['href'] != '':
                    writer.writerow(row)
            else:
                writer.writerow(row)

    print('Spreadsheet created for ' + course_dict['name'] + '.')
    print('Location: ' + outFileName)
print("location id not available") except: print("Failed to load locations") if __name__ == "__main__": ''' eg-:python 1934_glassdoor.py "Android developer", "new york" ''' argparser = argparse.ArgumentParser() argparser.add_argument('keyword', help='job name', type=str) argparser.add_argument('place', help='job location', type=str) args = argparser.parse_args() keyword = args.keyword place = args.place print("Fetching job details") scraped_data = parse(keyword, place) print("Writing data to output file") with open('%s-%s-job-results.csv' % (keyword, place), 'wb') as csvfile: fieldnames = ['Salary'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL) writer.writeheader() if scraped_data: for data in scraped_data: writer.writerow(data) else: print("Your search for %s, in %s does not match any jobs" % (keyword, place))
def create_neo4j_csv(results):
    """
    Create csv's for use by the neo4j import tool. Relies on the create_neo4j_
    functions' output and transforms it to a format suitable for automatic
    importing.
    Input:
        - results: dic,
        json-style dictionary. Check create_neo4j_ function output for details
    Output:
        - None, just saves the documents in the allocated path as defined
        in settings.yaml
    """
    outpath = settings['out']['csv']['out_path']
    entities_nodes = None
    articles_nodes = None
    relations_edges = None
    entity_pmc_edges = None
    other_nodes = []
    other_edges = []
    for nodes in results['nodes']:
        if nodes['type'] == 'Entity':
            entities_nodes = nodes['values']
        elif nodes['type'] == 'Article':
            articles_nodes = nodes['values']
        else:
            other_nodes.extend(nodes['values'])
    for edges in results['edges']:
        if edges['type'] == 'relation':
            relations_edges = edges['values']
        elif edges['type'] == 'mention':
            entity_pmc_edges = edges['values']
        elif edges['type'] == 'NEW':
            other_edges.extend(edges['values'])

    dic_ = {
        'entities.csv': entities_nodes,
        'articles.csv': articles_nodes,
        'other_nodes.csv': other_nodes,
        'entities_pmc.csv': entity_pmc_edges,
        'relations.csv': relations_edges,
        'other_edges.csv': other_edges
    }

    dic_fiels = {
        'entities.csv': ['id:ID', 'label', 'sem_types:string[]'],
        'articles.csv': ['id:ID', 'title', 'journal', 'sent_id:string[]'],
        'other_nodes.csv': ['id:ID'],
        'entities_pmc.csv': [':START_ID', 'score:float[]', 'sent_id:string[]', ':END_ID'],
        'relations.csv': [':START_ID', 'subject_score:float[]', 'subject_sem_type:string[]',
                          ':TYPE', 'pred_type:string[]', 'object_score:float[]',
                          'object_sem_type:string[]', 'sent_id:string[]',
                          'negation:string[]', ':END_ID'],
        'other_edges.csv': [':START_ID', ':TYPE', ':END_ID']
    }

    for k, toCSV in dic_.iteritems():
        if toCSV:
            keys = toCSV[0].keys()
            out = os.path.join(outpath, k)
            with open(out, 'wb') as output_file:
                time_log("Created file %s" % k)
                dict_writer = csv2.DictWriter(output_file,
                                              fieldnames=dic_fiels[k],
                                              encoding='utf-8')
                dict_writer.writeheader()
                dict_writer.writerows(toCSV)
    time_log('Created all documents needed')
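# For orientation, the headers above follow the neo4j bulk-import CSV
# conventions: 'id:ID' marks the node identifier column, ':START_ID' and
# ':END_ID' mark relationship endpoints, ':TYPE' the relationship type, and
# 'string[]' / 'float[]' declare array-typed properties. An illustrative
# entities.csv (made-up data, assuming the default ';' array delimiter)
# would therefore start like this:
#
#   id:ID,label,sem_types:string[]
#   C0011849,diabetes mellitus,dsyn
#   C0027051,myocardial infarction,dsyn;fndg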
def open_csv(): sources = [] curr_chapter_name, curr_chapter_num, curr_topic_name, curr_topic_num, curr_source_num, curr_source, prev_rows = None, 0, None, 0, 0, None, [] with open("sefer_haagada.csv", "rb") as fin: csv = unicodecsv.DictReader(fin) for row in csv: # if len(sources) >= 30: # break if len(row["sourceNum"]) > 0: if len(prev_rows) > 0: sources += [ make_parsed_source(curr_chapter_name, curr_chapter_num, curr_topic_name, curr_topic_num, curr_source_num, prev_rows) ] prev_rows = [] # update chapter new_chapter = update_chapter(row) if new_chapter: if new_chapter[1] != curr_chapter_num + 1: print("{} <= {} {}".format(new_chapter[1], curr_chapter_num, curr_chapter_name)) curr_chapter_name, curr_chapter_num = new_chapter curr_topic_num = 0 curr_source_num = 0 # update topic new_topic = update_topic(row) if new_topic: if new_topic[1] != curr_topic_num + 1: print("{} <= {} {}".format(new_topic[1], curr_topic_num, curr_topic_name)) curr_topic_name, curr_topic_num = new_topic # update source num if len(row["sourceNum"]) > 0: new_source_num = gematria(row["sourceNum"]) if new_source_num != curr_source_num + 1: print("yoyoyo {} <= {} {} -- {}".format( new_source_num, curr_source_num, curr_topic_name, curr_topic_num)) curr_source_num = new_source_num prev_rows += [row] if len(prev_rows) > 0: sources += [ make_parsed_source(curr_chapter_name, curr_chapter_num, curr_topic_name, curr_topic_num, curr_source_num, prev_rows) ] sources = [_f for _f in sources if _f] sources = disambiguate_all(sources) with open("parsed.csv", "wb") as fout: csv = unicodecsv.DictWriter(fout, [ "chapter_name", "chapter_num", "topic_name", "topic_num", "source_num", "source", "commentary", "good_ref_list", "bad_ref_list", "ref_list" ]) csv.writeheader() for s in sources: s["ref_list"] = ", ".join( [r.normal() for r in s.get("ref_list", [])]) s["good_ref_list"] = ", ".join( [r.normal() for r in s.get("good_ref_list", [])]) s["bad_ref_list"] = ", ".join( [r.normal() for r in s.get("bad_ref_list", [])]) csv.writerows(sources) with open("topics.csv", "wb") as fout: unique_topics = [{ "chapter_name": x["chapter_name"], "topic_name": x["topic_name"] } for x in reduce( lambda a, b: a + ([b] if (len(a) == 0 or a[-1]['topic_name'] != b[ 'topic_name']) else []), sources, [])] csv = unicodecsv.DictWriter(fout, ["chapter_name", "topic_name"]) csv.writeheader() csv.writerows(unique_topics)
def test_dict_dict(): new_file = "avodah_zarah_little_letters.csv" ein_parser.run2("az_collapsed", "avodah_zarah") comp_file = "avodah_zarah_done_jan18.csv" new = [] comp = [] new_has_segments = False comp_has_segments = False with open(new_file, 'r') as csvfile: file_reader = csv.DictReader(csvfile) if "Line" in file_reader.fieldnames: new_has_segments = True for i, row in enumerate(file_reader): if not row: continue else: new_dict = { "EM": row["original"], "Rambam": row["Rambam"], "Semag": row["Semag"], "TurShA": row["Tur Shulchan Arukh"] } if new_has_segments: new_dict['segment'] = '{}.{}'.format( row['Daf'], row['Line']) new.append(new_dict) with open(comp_file, 'r') as csvfile: file_reader = csv.DictReader(csvfile) if "Line" in file_reader.fieldnames: comp_has_segments = True for i, row in enumerate(file_reader): if not row: continue else: new_dict = { "EM": row["original"], "Rambam": row["Rambam"], "Semag": row["Semag"], "TurShA": row["Tur Shulchan Arukh"] } if comp_has_segments: new_dict['segment'] = '{}.{}'.format( row['Daf'], row['Line']) comp.append(new_dict) missmatch_cnt = 0 with open(u'az_test_diff.csv', 'w') as csv_file: writer = csv.DictWriter(csv_file, [u'line', u'old', u'new', u'EM']) writer.writeheader() if new_has_segments: lineseg = "a['segment']" elif comp_has_segments: lineseg = "b['segment']" else: lineseg = "i" for i, (a, b) in enumerate(zip(new, comp)): # assert a == b for k in a.keys(): if a[k] != b[k]: writer.writerow({ u'line': eval(lineseg), u'new': a[k], u'old': b[k], u'EM': a['EM'] }) missmatch_cnt += 1 assert missmatch_cnt == 6