#!/usr/bin/env python

import unicodecsv
import sys
from decimal import Decimal

in_csv = unicodecsv.DictReader(sys.stdin, encoding='utf-8')
out_csv = unicodecsv.DictWriter(sys.stdout,
                                fieldnames=in_csv.fieldnames,
                                encoding='utf-8')
out_csv.writeheader()

try:
    for line in in_csv:
        line['service_std_target'] = "%0.2f" % (
            Decimal(line['service_std_target']) / 100)
        out_csv.writerow(line)

except KeyError:
    if 'warehouse' in sys.argv:
        sys.exit(85)
    else:
        raise
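
# A minimal, self-contained sketch (not part of the original script) of the
# transformation the filter above performs: service_std_target is divided by 100
# with Decimal and formatted to two decimal places. The sample values are invented.
from decimal import Decimal

def convert_target(value):
    # e.g. "85" -> "0.85"; Decimal avoids binary floating-point surprises
    return "%0.2f" % (Decimal(value) / 100)

assert convert_target('85') == '0.85'
assert convert_target('100') == '1.00'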
Example 2
        return 'Público'
    print 'SEM FONTE:' + text
    return 'no source'


annotators = ['hanna', 'cristina', 'cfreitas', 'ccarvalho', 'andrea']
fieldnames = [
    'texto', 'fonte', 'sim_ironico', 'nao_ironico', 'naosei_ironico',
    'num_de_anotadores_total', 'Comparação', 'Hipérbole', 'Imparidade',
    'Metáfora', 'Paradoxo', 'Vulgarismo', 'Outro', 'Sem Evidência'
]
#fieldnames = ['texto', 'fonte', 'ironico', 'num_de_anotadores_ironico', 'num_de_anotadores_total', 'Comparação', 'Hipérbole', 'Imparidade', 'Metáfora', 'Paradoxo', 'Vulgarismo', 'Outro', 'Sem Evidência']
#output = codecs.open('annotated_10_INST.txt','w','utf-8')
#output = codecs.open('annotation_stats/data.txt','wb','utf-8')
output = open('data_all.csv', 'wb')
csvw = unicodecsv.DictWriter(output, delimiter='\t', fieldnames=fieldnames)
final = dict()

filename = 'block_0_IRONIA'
for an in annotators:
    try:
        with open('express_precuration/annotation/' + filename + '.tcf/' + an +
                  '.tsv') as tsv:
            for line in csv.reader(tsv, dialect="excel-tab"):
                if len(line) > 0:
                    if line[0].startswith('#text='):
                        text = line[0].rsplit('[ [', 1)[0]
                        text = text.replace('#text=', '').decode('utf-8')
                        if not final.has_key(text):
                            origin = checkOrigin(text)
                            final[text] = {
Example 3
    def get(self, request, number):
        """
        Creates a CSV for the order. The structure of the CSV looks like this:

           > Order Number:,EDX-100001

           > Seat in Demo with verified certificate (and ID verification)
           > Code,Redemption URL
           > J4HDI5OAUGCSUJJ3,ecommerce.server?code=J4HDI5OAUGCSUJJ3
           > OZCRR6WXLWGAFWZR,ecommerce.server?code=OZCRR6WXLWGAFWZR
           > 6KPYL6IO6Y3XL7SI,ecommerce.server?code=6KPYL6IO6Y3XL7SI
           > NPIJWIKNLRURYVU2,ecommerce.server?code=NPIJWIKNLRURYVU2
           > 6SZULKPZQYACAODC,ecommerce.server?code=6SZULKPZQYACAODC
           >

        Args:
            request (Request): The GET request
            number (str): Number of the order

        Returns:
            HttpResponse

        Raises:
            Http404: When an order number for a non-existing order is passed.
            PermissionDenied: When a user tries to download a CSV for an order that they did not place.

        """
        try:
            order = Order.objects.get(number=number)
        except Order.DoesNotExist:
            raise Http404('Order not found.')

        if request.user != order.user and not request.user.is_staff:
            raise PermissionDenied

        file_name = 'Enrollment code CSV order num {}'.format(order.number)
        file_name = '{filename}.csv'.format(filename=slugify(file_name))

        response = HttpResponse(content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename={filename}'.format(
                filename=file_name)

        redeem_url = get_ecommerce_url(reverse('coupons:offer'))
        voucher_field_names = ('Code', 'Redemption URL', 'Name Of Employee',
                               'Date Of Distribution', 'Employee Email')
        voucher_writer = csv.DictWriter(response,
                                        fieldnames=voucher_field_names)

        writer = csv.writer(response)
        writer.writerow(('Order Number:', order.number))
        writer.writerow([])

        order_line_vouchers = OrderLineVouchers.objects.filter(
            line__order=order)
        for order_line_voucher in order_line_vouchers:
            writer.writerow([order_line_voucher.line.product.title])
            voucher_writer.writeheader()

            for voucher in order_line_voucher.vouchers.all():
                voucher_writer.writerow({
                    voucher_field_names[0]:
                    voucher.code,
                    voucher_field_names[1]:
                    '{url}?code={code}'.format(url=redeem_url,
                                               code=voucher.code)
                })
            writer.writerow([])
        return response
Example 4
def write_csv(outfile, fieldnames, data):
    with open(outfile, 'wb') as open_outfile:
        csvfile = csv.DictWriter(open_outfile, fieldnames)
        csvfile.writeheader()
        csvfile.writerows(data)
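
# A hedged usage sketch for write_csv above, not taken from the original source.
# Because the file is opened in 'wb' mode, the module-level csv is presumably
# Python 2's csv or unicodecsv imported as csv (as in other examples on this
# page); the field names and rows are invented.
rows = [
    {u'name': u'Ada', u'lang': u'Python'},
    {u'name': u'Grace', u'lang': u'COBOL'},
]
write_csv('people.csv', [u'name', u'lang'], rows)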
Example 5
    final_data = []
    header_name = []
    for each_data in process:
        reduced = {}

        # assumption: reduce_the_item returns the flattened record; as written,
        # the call's return value was discarded and `reduced` stayed empty
        reduced = reduce_the_item(tool_name, each_data)

        header_name += reduced.keys()

        final_data.append(reduced)

    header_name = list(set(header_name))
    header_name.sort()

fpointer3.close()
'''
The contents of the collated JSON file are written into a CSV file.
'''
with open(csvfilename, 'a+') as fpointer4:
    w = csv.DictWriter(fpointer4,
                       header_name,
                       quoting=csv.QUOTE_ALL,
                       encoding='utf-8')
    w.writeheader()
    for each_line in final_data:
        w.writerow(each_line)
'''
A success message is printed.
'''
print("The CSV and the collated JSON are successfully generated!")
Example 6
#!/usr/bin/env python

import unicodecsv
import sys
import codecs

FIELDNAMES = 'ref_number,name,title_en,title_fr,description_en,description_fr,start_date,end_date,employee_attendees,guest_attendees,location_en,location_fr,total,owner_org,owner_org_title'.split(',')

assert sys.stdin.read(3) == codecs.BOM_UTF8

in_csv = unicodecsv.DictReader(sys.stdin, encoding='utf-8')

sys.stdout.write(codecs.BOM_UTF8)
out_csv = unicodecsv.DictWriter(sys.stdout, fieldnames=FIELDNAMES, encoding='utf-8')
out_csv.writeheader()

try:
    for line in in_csv:
        try:
            line['employee_attendees'] = str(int(line.pop('attendees')))
        except ValueError:
            line['employee_attendees'] = '0'
        line['guest_attendees'] = '0'
        out_csv.writerow(line)

except KeyError:
    if 'warehouse' in sys.argv:
        sys.exit(85)
    else:
        raise
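
# A minimal sketch (not in the original) isolating the attendee normalisation
# above: 'attendees' is renamed to 'employee_attendees', non-numeric values fall
# back to '0', and 'guest_attendees' is always set to '0'.
def normalise(line):
    try:
        line['employee_attendees'] = str(int(line.pop('attendees')))
    except ValueError:
        line['employee_attendees'] = '0'
    line['guest_attendees'] = '0'
    return line

assert normalise({'attendees': '12'}) == {'employee_attendees': '12', 'guest_attendees': '0'}
assert normalise({'attendees': 'n/a'}) == {'employee_attendees': '0', 'guest_attendees': '0'}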
Example 7
def analyze_course_content(
    course_id,
    listings_file=None,
    basedir="X-Year-2-data-sql",
    datedir="2013-09-21",
    use_dataset_latest=False,
    do_upload=False,
    courses=None,
    verbose=True,
    pin_date=None,
):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out staff-only and un-shown content.
    The exclusion is based on how often each module appears in the studentmodule table, as recorded
    in stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split(
                '/', 1)[0]  # extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (
                tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset,
                                      tableid,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False,
                                      format='csv',
                                      skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([
            ('[%s.course_metainfo]' % bqutil.course_id2dataset(x))
            for x in courses
        ])
        sql = "select * from {course_tables}".format(
            course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(
                dataset,
                table,
                sql=sql,
                newer_than=datetime.datetime(2015, 1, 16, 3, 0),
            )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)

        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max(
            [x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset,
                                                          latest_person_course)

        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset,
                                         'nregistered_by_wrap',
                                         sql=sql,
                                         key={'name': 'course_id'})

        # rates for registrants before and during course

        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset,
                                          'stats_cert_rates_by_registration',
                                          sql=sql,
                                          newer_than=datetime.datetime(
                                              2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(
                entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci[
                    'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {
            'msbc_viewed': "viewed_median_stats_by_course",
            'msbc_explored': 'explored_median_stats_by_course',
            'msbc_certified': 'certified_median_stats_by_course',
            'msbc_verified': 'verified_median_stats_by_course',
        }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        listings_keys = map(make_key, [
            "Institution", "Semester", "New or Rerun",
            "Andrew Recodes New/Rerun", "Course Number", "Short Title",
            "Andrew's Short Titles", "Title", "Instructors",
            "Registration Open", "Course Launch", "Course Wrap", "course_id",
            "Empirical Course Wrap", "Andrew's Order", "certifies",
            "MinPassGrade", '4-way Category by name',
            "4-way (CS, STEM, HSocSciGov, HumHistRel)"
        ])
        listings_keys.reverse()

        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = [
            'discussion', 'problem', 'optionresponse', 'checkboxgroup',
            'optioninput', 'choiceresponse', 'video', 'choicegroup',
            'vertical', 'choice', 'sequential', 'multiplechoiceresponse',
            'numericalresponse', 'chapter', 'solution', 'img',
            'formulaequationinput', 'responseparam', 'selfassessment', 'track',
            'task', 'rubric', 'stringresponse', 'combinedopenended',
            'description', 'textline', 'prompt', 'category', 'option', 'lti',
            'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt',
            'comment', 'annotationinput', 'image', 'options', 'comment_prompt',
            'conditional', 'answer', 'poll_question', 'section', 'wrapper',
            'map', 'area', 'customtag', 'transcript', 'split_test',
            'word_cloud', 'openended', 'openendedparam', 'answer_display',
            'code', 'drag_and_drop_input', 'customresponse', 'draggable',
            'mentoring', 'textannotation', 'imageannotation', 'videosequence',
            'feedbackprompt', 'assessments', 'openassessment', 'assessment',
            'explanation', 'criterion'
        ]

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith(
                    'listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (
                    course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [json.loads(x) for x in open(listings_file)]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_combined_course_summary_stats.json' %
                 mypath).read())
        schema_dict = {x['name']: x for x in the_schema}

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in list(entry):  # snapshot the keys; entry is mutated below
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset,
                                      css_table,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False)

        return

    print "-" * 60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    if listings_file.endswith('.json'):
        data_feed = map(json.loads, open(lfn))
    else:
        data_feed = csv.DictReader(open(lfn))
    for k in data_feed:
        if not 'course_id' in k:
            print "Strange course listings row, no course_id in %s" % k
            raise Exception("Missing course_id")
        if k['course_id'] == course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (
            course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    if pin_date:
        datedir = pin_date
    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest and not pin_date)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)

    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (
            xbfn, course_id)

        if use_dataset_latest:
            # try looking in earlier directories for xbundle file
            import glob
            spath = course_dir / ("../*/xbundle_%s.xml" % cfn)
            files = list(glob.glob(spath))
            if files:
                xbfn = path(files[-1])
            if not xbfn.exists():
                print "   --> also cannot find any %s ; aborting!" % spath
            else:
                print "   --> Found and using instead: %s " % xbfn
        if not xbfn.exists():
            raise Exception("[analyze_content] missing xbundle file %s" % xbfn)

    # if there is an xbundle*.fixed file, use that instead of the normal one
    if os.path.exists(str(xbfn) + ".fixed"):
        xbfn = path(str(xbfn) + ".fixed")

    print "[analyze_content] For %s using %s" % (course_id, xbfn)

    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir,
                                    use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()

    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = [
        'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1',
        'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike',
        'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's',
        'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki',
        'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4',
        'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p',
        'P', 'TABLE', 'TD', 'small', 'text', 'title'
    ]

    problem_stats = defaultdict(int)

    def does_problem_have_random_script(problem):
        '''
        return 1 if problem has a script with "random." in it
        else return 0
        '''
        for elem in problem.findall('.//script'):
            if elem.text and ('random.' in elem.text):
                return 1
        return 0

    # walk through xbundle
    def walk_tree(elem, policy=None):
        '''
        Walk XML tree recursively.
        elem = current element
        policy = dict of attributes for children to inherit, with fields like due, graded, showanswer
        '''
        policy = policy or {}
        if type(elem.tag) == str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        if elem.tag in [
                "sequential", "problem", "problemset", "course", "chapter"
        ]:  # very old courses may use inheritance from course & chapter
            keys = ["due", "graded", "format", "showanswer", "start"]
            for k in keys:  # copy inheritable attributes, if they are specified
                val = elem.get(k)
                if val:
                    policy[k] = val
        if elem.tag == "problem":  # accumulate statistics about problems: how many have show_answer = [past_due, closed] ?  have random. in script?
            problem_stats['n_capa_problems'] += 1
            if policy.get('showanswer'):
                problem_stats["n_showanswer_%s" %
                              policy.get('showanswer')] += 1
            else:
                problem_stats[
                    'n_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)
                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118
                # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed.
            problem_stats[
                'n_random_script'] += does_problem_have_random_script(elem)

            if policy.get('graded') == 'true' or policy.get(
                    'graded') == 'True':
                problem_stats['n_capa_problems_graded'] += 1
                problem_stats[
                    'n_graded_random_script'] += does_problem_have_random_script(
                        elem)
                if policy.get('showanswer'):
                    problem_stats["n_graded_showanswer_%s" %
                                  policy.get('showanswer')] += 1
                else:
                    problem_stats[
                        'n_graded_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)

        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    try:
                        print "    -> excluding %s (%s), ncount=%s" % (
                            k.get('display_name',
                                  '<no_display_name>').encode('utf8'), midfrag,
                            mudata.get(midfrag, {}).get('ncount'))
                    except Exception as err:
                        print "    -> excluding ", k
                continue
            walk_tree(k, policy.copy())

    walk_tree(xml)
    print "--> Count of individual element tags throughout XML: ", counts

    print "--> problem_stats:", json.dumps(problem_stats, indent=4)

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = [
        'openassessment',
        'optionresponse',
        'multiplechoiceresponse',
        # 'discussion',
        'choiceresponse',
        'word_cloud',
        'combinedopenended',
        'choiceresponse',
        'stringresponse',
        'textannotation',
        'openended',
        'lti'
    ]
    quant_axis = [
        'formularesponse', 'numericalresponse', 'customresponse',
        'symbolicresponse', 'coderesponse', 'imageresponse'
    ]

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count

    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (
        nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k

    ccd[course_id] = {
        'course_id': course_id,
        'nweeks': nweeks,
        'nqual_per_week': nqual_per_week,
        'nquant_per_week': nquant_per_week,
        'total_assessments_per_week': total_per_week,
    }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = [
        'course_id', 'nquant_per_week', 'total_assessments_per_week',
        'nqual_per_week', 'nweeks'
    ]
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update(
        {make_key('listings_%s' % key): value
         for key, value in data.items()})  # from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    cmfields['filename_xbundle'] = xbfn
    cmfields['filename_listings'] = lfn

    for key in sorted(
            counts
    ):  # store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] = str(value)  # from content counts

    for key in sorted(problem_stats):  # store problem stats
        value = problem_stats[key]
        cmfields['problem_stat_%s' % key] = str(value)

    cmfields.update({('nexcluded_sub_20_%s' % key): str(value)
                     for key, value in nexcluded.items()
                     })  # from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id'] == course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'],
                                                           ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()

    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})

    fp.close()

    # build and output course_listings_and_metainfo

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    mypath = os.path.dirname(os.path.realpath(__file__))
    clm_table = "course_listing_and_metainfo"
    clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table)
    clm_schema = json.loads(open(clm_schema_file).read())

    clm = {}
    for finfo in clm_schema:
        field = finfo['name']
        clm[field] = cmfields.get(field)
    clm_fnb = clm_table + ".json"
    clm_fn = course_dir / clm_fnb
    open(clm_fn, 'w').write(json.dumps(clm))

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / clm_fnb
    print "--> Course listing + metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, clm_table)
    sys.stdout.flush()
    gsutil.upload_file_to_gs(clm_fn, gsfnp)
    bqutil.load_data_to_table(dataset,
                              clm_table,
                              gsfnp,
                              clm_schema,
                              wait=True,
                              verbose=False)

    # output course_metainfo

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, table)
    sys.stdout.flush()

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfnp,
                              the_schema,
                              wait=True,
                              verbose=False,
                              format='csv',
                              skiprows=1)
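
# A hedged illustration, not from the original source, of the long-format
# (course_id, key, value) layout that the course_metainfo CSV above is written
# in. The sample keys mirror ones computed by analyze_course_content; the
# course_id and values are invented.
import csv

rows = [
    {'course_id': 'MITx/6.00x/2013_Spring', 'key': 'nweeks', 'value': '12.0'},
    {'course_id': 'MITx/6.00x/2013_Spring', 'key': 'count_problem', 'value': '245'},
    {'course_id': 'MITx/6.00x/2013_Spring', 'key': 'problem_stat_n_capa_problems', 'value': '245'},
]
with open('course_metainfo_example.csv', 'w') as fp:
    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    cdw.writerows(rows)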
Example 8
    def test_encode_error_dictwriter(self):
        fd = BytesIO()
        dw = csv.DictWriter(fd, ['col1'],
                            encoding='cp1252', errors='xmlcharrefreplace')
        dw.writerow({'col1': chr(2604)})  # Python 3 chr; on Python 2 this would need unichr
        self.assertEqual(fd.getvalue(), b'&#2604;\r\n')
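
# A standalone sketch of the behaviour the test above asserts, assuming the
# test's csv module is unicodecsv (which the encoding/errors keywords suggest):
# a character that cp1252 cannot encode is written as an XML character
# reference. U+0A2C is code point 2604.
from io import BytesIO
import unicodecsv

fd = BytesIO()
dw = unicodecsv.DictWriter(fd, ['col1'], encoding='cp1252',
                           errors='xmlcharrefreplace')
dw.writerow({'col1': u'\u0a2c'})  # code point 2604, not representable in cp1252
assert fd.getvalue() == b'&#2604;\r\n'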
Example 9
#encoding=utf-8

import django
django.setup()

import requests
from bs4 import BeautifulSoup, element
import unicodecsv as csv
import codecs

dict_list = []
for i in range(1, 257):
    url = u'http://www.nechama.org.il/commentatorsPopup/{}.html'.format(i)
    r = requests.get(url)
    data = r.content
    content = BeautifulSoup(data, "lxml")
    title = content.find(attrs={'id': 'contentTop'}).get_text()
    text = content.find(attrs={'id': 'contentBody'}).get_text()
    dict_list.append({u'number': i, u'name': title, u'text': text})

with open('parshanim.csv', 'w') as csv_file:
    writer = csv.DictWriter(csv_file, [u'number', u'name', u'text'])
    writer.writeheader()
    writer.writerows(dict_list)

print "done"
Example 10
    def __init__(self, data_file):
        self._csv_writer = unicodecsv.DictWriter(
            data_file, fieldnames=_REQUIRED_COLUMNS, lineterminator="\n")
        self._csv_writer.writeheader()
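
# A hedged usage sketch: the __init__ above belongs to an unnamed CSV-recording
# class, and _REQUIRED_COLUMNS is defined elsewhere in its module. The class
# name CsvRecorder and the column list below are assumptions made purely for
# illustration.
import unicodecsv

_REQUIRED_COLUMNS = ['id', 'name']  # assumption: the real column list is not shown


class CsvRecorder(object):  # hypothetical name for the class the method comes from
    def __init__(self, data_file):
        self._csv_writer = unicodecsv.DictWriter(
            data_file, fieldnames=_REQUIRED_COLUMNS, lineterminator="\n")
        self._csv_writer.writeheader()


with open('records.csv', 'wb') as fh:
    recorder = CsvRecorder(fh)
    recorder._csv_writer.writerow({'id': u'1', 'name': u'Ada'})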
Example 11
def _make_csv(abbr, name, fields):
    filename = '/tmp/{0}_{1}'.format(abbr, name)
    f = unicodecsv.DictWriter(open(filename, 'w'), fields)
    f.writerow(dict(zip(fields, fields)))
    return filename, f
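
# A hedged usage sketch for _make_csv above (the field names and row are
# invented): it opens /tmp/<abbr>_<name>, writes a header row by zipping the
# field names onto themselves, and returns the path together with the
# DictWriter so the caller can append data rows.
fields = ['id', 'title', 'status']
filename, writer = _make_csv('ca', 'bills', fields)
writer.writerow({'id': 'AB-1', 'title': 'Example bill', 'status': 'passed'})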
Example 12
def getWordLinks(args):

    print("Getting .docx Links")

    # Handle arguments and flags
    parser = argparse.ArgumentParser(usage=instructions, add_help=False)
    parser.add_argument("--help", "-h", action="store_true")
    parser.add_argument("-r", action="store_true")
    parser.add_argument("-l", action="store_true")
    parser.add_argument("-o", action="store")
    parser.add_argument("file_names", nargs="*")

    args = parser.parse_args(args)

    # Replace arguments with wildcards with their expansion.
    # If a string does not contain a wildcard, glob will return it as is.
    # Mostly important if we run this on Windows systems.
    file_names = list()

    for name in args.file_names:
        file_names += glob.glob(glob.escape(name))

    # If the filenames don't exist, say so and quit.
    if file_names == []:
        sys.exit("No file or directory found by that name.")

    # Don't run the script on itself.
    if sys.argv[0] in file_names:
        file_names.remove(sys.argv[0])

    if args.help:
        sys.exit(instructions)

    filecount = 0
    linklist = []
    target_is_folder = False

    for name in file_names:
        # Make sure single files exist.
        assert os.path.exists(name), "File or directory not found."

        # If it's just a file...
        if os.path.isfile(name):
            # Make sure this is a Word file (just check extension)
            if name.lower().endswith(".docx") or name.lower().endswith(
                    ".docm"):
                # Get links from that file.
                linklist.extend(getLinks(name, args, False))
                filecount += 1

        # If it's a directory:
        if os.path.isdir(name):
            target_is_folder = True
            # Recursive version using os.walk for all levels.
            if args.r:
                for dirpath, dirnames, files in os.walk(name):
                    for eachfile in files:
                        # Get links for every file in that directory.
                        if eachfile.lower().endswith(
                                ".docx") or eachfile.lower().endswith(".docm"):
                            linklist.extend(getLinks(eachfile, args, dirpath))
                            filecount += 1
            # Non-recursive version breaks os.walk after the first level.
            else:
                topfiles = []
                for (dirpath, dirnames, files) in os.walk(name):
                    topfiles.extend(files)
                    break
                for eachfile in topfiles:
                    if eachfile.lower().endswith(
                            ".docx") or eachfile.lower().endswith(".docm"):
                        linklist.extend(getLinks(eachfile, args, dirpath))
                        filecount += 1

    # When called by other scripts, quietly return the list and stop.
    if args.l:
        return linklist

    # Otherwise, output a file and print some info.
    print("\nChecked " + str(filecount) + " .docx file" +
          ("s" if filecount > 1 else "") + " for links.")

    # Create output file as sibling to the original target of the script.
    outFileName = args.o if args.o else "Word_Doc_Links.csv"
    if target_is_folder:
        outFileFolder = os.path.abspath(os.path.join(file_names[0], os.pardir))
        outFilePath = os.path.join(outFileFolder, outFileName)
    else:
        outFilePath = os.path.join(os.path.dirname(file_names[0]), outFileName)

    with open(outFilePath, "wb") as outputFile:
        fieldnames = ["filename", "href", "text"]

        writer = csv.DictWriter(outputFile,
                                fieldnames=fieldnames,
                                extrasaction="ignore")
        writer.writeheader()

        for row in linklist:
            writer.writerow(row)

    print("Spreadsheet created: " + outFileName)
    print("Location: " + outFilePath)
Example 13
        except ValueError:
            pass
        else:
            raise ValueError

    for fmt in formats:
        try:
            return datetime.strptime(d, fmt)
        except ValueError:
            pass
    return from_excel(int(d))


in_csv = unicodecsv.DictReader(sys.stdin, encoding='utf-8')
out_csv = unicodecsv.DictWriter(sys.stdout,
                                fieldnames=FIELDNAMES,
                                encoding='utf-8')
out_csv.writeheader()

err_csv = None
original = None
line = None
if sys.argv[1:]:
    err_csv = unicodecsv.DictWriter(open(sys.argv[1], 'wb'),
                                    fieldnames=in_csv.fieldnames,
                                    encoding='utf-8')
    err_csv.writeheader()


def error(msg, value=None):
    sys.stderr.write(line['owner_org'] + ' ' + line['ref_number'] + ' ' + msg +
Example 14
    if e['title'] == '"Lady Marmalade"' and "Christina" in e['artists']:
        row['title'] = '"Lady Marmalade (Moulin Rouge)"'
    prev_artist = e['artists']
    res_updated.append(row)

# Consolidate titles
res_by_title = defaultdict(dict)
for e in res_updated:
    row = {}
    row['artists'] = e['artists']
    row['title'] = e['title']
    row['entry_{0}'.format(e['entry'])] = e['date']
    row['weeks_{0}'.format(e['entry'])] = e['weeks']
    res_by_title[e['title'] + e['artists']].update(row)

# Return to list form
res_final = []
for key, value in res_by_title.items():
    res_final.append(value)

# Prepare fields for csv export.
csv_fields = [
    'title', 'artists', 'entry_1', 'entry_2', 'entry_3', 'weeks_1', 'weeks_2',
    'weeks_3'
]

# Export csv
with open('billboard_wiki.csv', 'wb') as f:
    writer = unicodecsv.DictWriter(f, csv_fields)
    writer.writeheader()
    writer.writerows(res_final)
Example 15
        return properties_list


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='')
    sortorder_help = """
    available sort orders are :
    newest : Latest property details,
    cheapest : Properties with cheapest price
    """
    argparser.add_argument('sort',
                           nargs='?',
                           help=sortorder_help,
                           default='Homes For You')
    args = argparser.parse_args()
    zipcode = args.zipcode
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode, sort)
    print("Writing data to output file")
    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = [
            'title', 'address', 'city', 'state', 'postal_code', 'price',
            'facts and features', 'real estate provider', 'url'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in scraped_data:
            writer.writerow(row)
Example 16
        continue

    source_id = arg
    posts = soup.findAll('div', attrs={'class': '_5pcr userContentWrapper'})

    output = "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /><title>Extraction results</title></head>\n<body><table border=1 style='font-size:13px;border-collapse:collapse;table-layout:fixed;width:1300px;word-break:break-all'><tr><td style='width:30px'><center>#</center></td><td style='width:120px;'>Post id</td><td style='width:100px;'>Time_published</td><td style='width:100px;'>Author name</a></td><td style='width:100px;'>Author ID</td><td style='width:300px'>Post message</td><td style='width:45px'><center>Shared<br> as</center></td><td style='width:25px'><center>#<br>pics</center></td><td style='width:100px;'><center>Pics</center></td><td style='width:25px'><center>#<br>vids</center></td><td style='width:100px'><center>Vids</center></td><td style='width:30px'><center>#<br>links</center></td><td style='width:40px'><center>Links</center></td><td style='width:40px'><center>Reacts</center></td><td style='width:40px'><center>Like</center></td><td style='width:40px'><center>Love</center></td><td style='width:40px'><center>Haha</center></td><td style='width:40px'><center>Angry</center></td><td style='width:40px'><center>Sad</center></td><td style='width:40px'><center>Wow</center></td><td style='width:40px'><center>Shares</center></td><td style='width:40px'><center>Comments</center></td></tr>"

    with open(arg + ".csv", 'wb') as csvfile:
        fieldnames = [
            'source_id', 'post_id', 'post_url', 'created_time', 'author_name',
            'author_id', 'msg', 'shared_as', 'pic_count', 'pics', 'vid_count',
            'vids', 'link_count', 'links', 'reactions', 'like', 'love', 'haha',
            'angry', 'sad', 'wow', 'shares', 'comment_count'
        ]
        writer = csv.DictWriter(csvfile,
                                fieldnames=fieldnames,
                                delimiter=',',
                                lineterminator='\n')
        writer.writeheader()

    index = 0
    regex1 = re.compile('.*\.php\?id\=.+?\&.*')
    regex2 = re.compile('.*_5pbx.*')
    regex3 = re.compile('.*scaledImageFit.*')
    regex4 = re.compile('async.*')
    img_regex = re.compile('_s0 _4ooo _\d.*?_rw img')

    for post in posts:
        index = index + 1
        try:
            #print "trying "+str(index)
            post_id = post.find('div',
Example 17
            'Hierarchies (count)':
            len(publisher_stats['hierarchies']),
            'Hierarchies':
            ';'.join(publisher_stats['hierarchies']),
        }


with open(os.path.join('out', 'publishers.csv'), 'w') as fp:
    writer = unicodecsv.DictWriter(fp, [
        'Publisher Name',
        'Publisher Registry Id',
        'Activities',
        'Organisations',
        'Files',
        'Activity Files',
        'Organisation Files',
        'Total File Size',
        'Reporting Org on Registry',
        'Reporting Orgs in Data (count)',
        'Reporting Orgs in Data',
        'Hierarchies (count)',
        'Hierarchies',
    ])
    writer.writeheader()
    for d in publisher_dicts():
        writer.writerow(d)

publishers = data.current_stats['inverted_publisher']['activities'].keys()

with open(os.path.join('out', 'elements.csv'), 'w') as fp:
    writer = unicodecsv.DictWriter(fp, ['Element'] + publishers)
Example 18
def runchart(coinObj):
    plt.xticks(rotation = 90)
    # xs = df['date'][:100]
    # print (xs)
    # ys = df['price'][:100]
    # print (ys)
    print (list(coinObj.keys())[0])
    xs = coinObj[list(coinObj.keys())[0]]['date'][:100]
    xs = pd.to_datetime(xs)
    #print(xs)
    #coinList = list(coinObj)
   # for idx, coinNM in enumerate(coinList):
    chart_idx = 1
    print('len',len(coinObj))
    # coinList = []
    # coinNmList = []

    for coinNM  in coinObj:
        # coninList = coinObj[coinNM]
        print (coinObj[coinNM])
        xs = coinObj[coinNM]['date']
        xst = pd.to_datetime(xs)
        ys = coinObj[coinNM]['price']

        coinObjlen = len(coinObj)

        #chart create
        subnum =  coinObjlen *100 +10 + chart_idx
        plt.subplot(subnum)
        plt.ylabel(coinNM)
        plt.xlabel('Exposure Time')
        plt.plot(xst, ys ,label=coinNM)

        plt.grid(True)
        plt.legend(loc='best', title=coinNM)
        chart_idx += 1

        meme_buy_count = 1
        meme_sell_count = 1
        memelist = []
        for idx, val in enumerate(ys):
            print (idx, val ,ys[idx])
            mobj = collections.OrderedDict()
            mobj['date'] =''
            mobj['price'] =''
            mobj['buy_volume'] = 0
            mobj['sell_volume'] = 0
            mobj['sum_buy'] = 0
            mobj['sum_sell'] = 0
            mobj['sum_volume'] = 0

            if idx < len(ys) - 3:
                # buy
                if ys[idx] > ys[idx + 1]:
                    if ys[idx + 1] > ys[idx + 2]:
                        if ys[idx + 2] > ys[idx + 3]:
                           # if not memelist or memelist[-1]['sell_volume'] ==1:
                              # print('b', meme_buy_count, str(idx - 1) + '>' + str(idx), ys[idx])
                                meme_buy_count += 1
                                plt.scatter(xst[idx], ys[idx], color='g', marker='^')
                                mobj['buy_volume'] = 1
                                mobj['sum_buy'] = ys[idx] * mobj['buy_volume']
         #               memelist.append(mobj)
                        #     coinObj[coinNM][idx]['buy'] = 1
                        # else:
                        #     coinObj[coinNM][idx]['buy'] = 0
                # sell
                if ys[idx] < ys[idx + 1]:
                    if ys[idx + 1] < ys[idx + 2]:
                        if ys[idx + 2] < ys[idx + 3]:
                         #   if memelist[-1]['buy_volume'] ==1:
                                print('s', meme_sell_count, str(idx - 1) + '>' + str(idx), ys[idx])
                                meme_sell_count += 1
                                plt.scatter(xst[idx], ys[idx], color='r', marker='v')
                                mobj['sell_volume'] = 1
                                if memelist:
                                    mobj['sum_sell'] = ys[idx] * mobj['sell_volume']


            mobj['date'] = xs[idx]
            mobj['price'] = ys[idx]
            if memelist :
               mobj['sum_volume']  =  memelist[-1]['sell_volume']+mobj['buy_volume']- mobj['sell_volume']
               #mobj['sell_volume']  =  memelist[-1]['sell_volume']+mobj['sell_volume']
               #mobj['buy_volume']  =  memelist[-1]['buy_volume']+ mobj['buy_volume']


            memelist.append(mobj)

        print(memelist[:100])
        keys = memelist[0].keys()
        with open('t/'+coinNM+'_trade.csv', 'wb') as output_file:
            dict_writer = csv.DictWriter(output_file, keys)
            dict_writer.writeheader()
            dict_writer.writerows(memelist)
    plt.xticks(rotation=90)
    plt.show()
Example 19
    def handle(self, **options):
        output = options['output']
        from froide.publicbody.models import PublicBody
        from froide.foirequest.models import FoiRequest

        year = 2015

        def get_status(status):
            KNOWN_STATUS = (
                'successful',
                'partially_successful',
                'refused',
            )
            MAP_STATUS = {}
            if status in KNOWN_STATUS:
                return status
            return MAP_STATUS.get(status, 'other')

        def convert_value(val):
            if not val:
                return 0
            else:
                return int(val)

        def stats_for_queryset(qs, key=None):
            status_counter = Counter()
            for r in qs:
                arg = key
                if arg is None:
                    arg = r.public_body.name
                status_counter[get_status(r.status)] += 1
            return status_counter

        output = open(output, 'w')
        writer = unicodecsv.DictWriter(output,
            ('name', 'gb', 'year', 'total_count'), encoding='utf-8')
        writer.writeheader()

        short_names = [
            "BK",
            "BMAS",
            "AA",
            "BMI",
            "BMJV",
            "BMF",
            "BMWi",
            "BMEL",
            "BMVg",
            "BMFSFJ",
            "BMG",
            "BMVI",
            "BMUB",
            "BMBF",
            "BKM",
            "BMZ",
            "BPA",
            "BPräsA",
            "BT",
            "BR",
            "BBank",
            "BfDI",
            "BRH",
            'BVerfG'
        ]

        for year in range(2011, 2016):
            for short_name in short_names:
                print(short_name)
                try:
                    root_pb = PublicBody.objects.get(
                        jurisdiction_id=1,
                        other_names__contains='%s,' % short_name
                    )
                except PublicBody.DoesNotExist:
                    print('missing')
                    continue
                root_count = root_pb.foirequest_set.filter(first_message__year=year, is_foi=True).count()
                pbs = PublicBody.objects.filter(root=root_pb)
                qs = FoiRequest.objects.filter(first_message__year=year,
                      public_body__in=pbs, is_foi=True)
                total_count = len(list(qs))
                writer.writerow({
                    'name': short_name,
                    'year': year,
                    'gb': 'True',
                    'total_count': total_count,
                })
                writer.writerow({
                    'name': short_name,
                    'year': year,
                    'gb': 'False',
                    'total_count': root_count,
                })
Ejemplo n.º 20
0
    def write_headers(self):
        self.csv_writer = csv.DictWriter(self.fout, self._headers)
        self.csv_writer.writeheader()
def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    f = open('test_data.csv','wb')
    w = csv.DictWriter(f, ["pmid", "domain", "sent_text", "random", "human", "algorithm", "top3", "top1"], escapechar="\\")
    w.writeheader()

    # parse the risk of bias data from Cochrane     
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]}

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space

    for k_i, (train, test) in enumerate(kf):

        if k_i == 1:
            break

        y_train = docs.y(uids[train])

            
        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]), low=7) # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2) # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall', n_jobs=16)

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_ # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)



        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            

            vec.builder_clear()
            vec.builder_add_interaction_features(docs.X(uids[test], domain=domain)) # add base features
            vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain)) # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)




            y_df = clf.decision_function(X_test) # get distances from the decision boundary
            # positive distances = more likely to be relevant sentences

            r_len = len(y_preds)
            y_top3 = []
            y_top1 = []
            y_rand = []

            y_uids = np.array(docs.y_uids(uids[test], domain=domain))

            # import pdb; pdb.set_trace()

            for y_uid in np.unique(y_uids):

                mask = np.where(y_uids == y_uid)[0]
                doc_df = y_df[mask]

                doc_top3 = np.argpartition(doc_df, -3)[-3:]
                y_top3.extend(list(mask[doc_top3]))
                
                doc_top1 = np.argmax(doc_df)
                y_top1.append(mask[doc_top1])

                doc_rand = np.random.randint(0, len(doc_df))
                y_rand.append(mask[doc_rand])


            human_sent_indices = np.where(y_test==1)[0]
            algorithm_sent_indices = np.where(y_preds==1)[0]

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain)

            # import pdb; pdb.set_trace()

            for doc_i, (doc, pmid) in enumerate(izip(docs.X(uids[test], domain=domain), docs.iter_pmid(uids[test], domain=domain))):

                row = {"domain": domain,
                       "sent_text": doc,
                       "random": doc_i in y_rand,
                       "human": doc_i in human_sent_indices,
                       "algorithm": doc_i in algorithm_sent_indices,
                       "top3": doc_i in y_top3,
                       "top1": doc_i in y_top1,
                       "pmid": pmid}

                if row["random"] or row["human"] or row["top3"] or row["top1"]:
                    # Note: a sentence is only written out if the random baseline,
                    # a human annotator, or the top-1/top-3 selection picked it;
                    # the raw classifier prediction is still recorded for those rows.
                    # In effect, where the classifier picks <= 3 sentences we keep
                    # all of its output, and where it predicts more than 3 only the
                    # top 3 are used; the rest are discarded.
                    w.writerow(row)

            del X_test, y_test, y_preds

        del clf



    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    f.close()
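
The top-3 selection above leans on np.argpartition, which rearranges indices so that the last k positions hold the k largest values (in arbitrary order). A small standalone sketch of that pattern, using made-up scores:

import numpy as np

doc_df = np.array([0.1, -0.4, 2.3, 0.9, 1.7, -1.2])  # made-up decision-function values
top3 = np.argpartition(doc_df, -3)[-3:]               # indices of the 3 largest, unordered
print(sorted(top3))                                   # [2, 3, 4]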
Ejemplo n.º 22
0
#!/usr/bin/python
from lxml import etree
import unicodecsv as csv

departements = ['03', '15', '19', '23', '43', '63']
circos = ['01', '02', '03', '04', '05']
ficsv = open('lg017_propor_tour1.csv', 'w')

try:
    fieldnames = []
    fieldnames.extend([
        'circo', 'inscrits', 'votants', 'exprimes', 'abstentions', 'blancs',
        'nuls', 'EXG', 'COM', 'FI', 'SOC', 'RDG', 'DVG', 'ECO', 'DIV', 'REG',
        'REM', 'MDM', 'UDI', 'LR', 'DVD', 'DLF', 'FN', 'EXD'
    ])
    majcsv = csv.DictWriter(ficsv, fieldnames=fieldnames)
    majcsv.writeheader()
    for dep in departements:
        for circo in circos:
            try:
                arbre = etree.parse(
                    "http://elections.interieur.gouv.fr/telechargements/LG2017/resultatsT1/0"
                    + dep + "/0" + dep + circo + ".xml")
                print("dep OK")
                for noeud in arbre.xpath(
                        "//Election/Departement/Circonscription"):
                    objet = {}
                    for circ in noeud.xpath("CodCirLg"):
                        objet["circo"] = dep + circ.text
                    for resultats in noeud.xpath("Tours/Tour[NumTour=1]"):
                        for inscrits in resultats.xpath(
Ejemplo n.º 23
0
    print(file=sys.stderr)
    print('writing courses...', file=sys.stderr)
    # print(os)
    # print(oc)

    for oid, sids in os.iteritems():
        oname = get_offering_name(oid)
        for sid in sids:
            s = get_section(oid, sid)
            if not s:
                continue
            s['name'] = oname
            s['catalogs'] = ';'.join(oc[oid])
            yield s
        time.sleep(0.2)  # be a good boy; don't stress the server


if __name__ == '__main__':

    writer = unicodecsv.DictWriter(sys.stdout, [
        'id', 'name', 'location', 'start', 'end', 'day', 'time', 'cost',
        'credit', 'instructors', 'catalogs', 'link'
    ],
                                   delimiter='\t')
    writer.writeheader()

    for s in get_all_sections():
        writer.writerow(s)

    print('done', file=sys.stderr)
Ejemplo n.º 24
0
            status = "Destroyed"

        if status == "Created":
            location = line.split(" at ")[1].split(" created")[0]
        elif status == "Destroyed":
            location = line.split(" at ")[1].split(" destroyed ")[0]

        rally = {"Datetime": date_val, "Location": location, "Status": status}
        outdata.append(rally)
        #print(current_map)


keys = outdata[0].keys()

with open('rally_point_location.csv', 'wb') as out_file:
    dict_writer = csv.DictWriter(out_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(outdata)




# Get player kill info

#[2018.08.31-04.55.10:064][730]LogSquad: Player:[LLJK] ☢Riyott.exe☢ ActualDamage=186.000015 from oldstastic2011 caused by BP_M4_M68_C_20
#[2018.08.31-04.55.35:645][  2]LogSquad: ScorePoints: Points: -1.000000 ScoreEvent: TeamKilled Jordan Reagan

regex = r"(Player:.*) (ActualDamage=.*) from (.*) caused by (.*)"

kill_logs = []
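
For reference, the regex above can be exercised against a slightly simplified copy of the sample log line quoted in the comment; a quick sketch (the group values shown are what greedy matching yields for that line):

import re

sample = ("[2018.08.31-04.55.10:064][730]LogSquad: Player:[LLJK] Riyott.exe "
          "ActualDamage=186.000015 from oldstastic2011 caused by BP_M4_M68_C_20")
m = re.search(r"(Player:.*) (ActualDamage=.*) from (.*) caused by (.*)", sample)
if m:
    player, damage, attacker, weapon = m.groups()
    # player   -> 'Player:[LLJK] Riyott.exe'
    # damage   -> 'ActualDamage=186.000015'
    # attacker -> 'oldstastic2011'
    # weapon   -> 'BP_M4_M68_C_20'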
Ejemplo n.º 25
0
def getWordLinks(args):

    # Handle arguments and flags
    parser = argparse.ArgumentParser(usage=instructions, add_help=False)
    parser.add_argument('--help', '-h', action='store_true')
    parser.add_argument('-r', action='store_true')
    parser.add_argument('-l', action='store_true')
    parser.add_argument('file_names', nargs='*')

    args = parser.parse_args(args)

    # Replace arguments with wildcards with their expansion.
    # If a string does not contain a wildcard, glob will return it as is.
    # Mostly important if we run this on Windows systems.
    file_names = list()

    for name in args.file_names:
        file_names += glob(name)

    # If the filenames don't exist, say so and quit.
    if file_names == []:
        sys.exit('No file or directory found by that name.')

    # Don't run the script on itself.
    if sys.argv[0] in file_names:
        file_names.remove(sys.argv[0])

    optionlist = []
    if args.help: sys.exit(instructions)
    if args.r: optionlist.append('r')
    if args.l: optionlist.append('l')

    filecount = 0
    linklist = []
    target_is_folder = False

    for name in file_names:
        # Make sure single files exist.
        assert os.path.exists(name), "File or directory not found."

        # If it's just a file...
        if os.path.isfile(name):
            # Make sure this is a .docx file (just check the extension)
            if name.lower().endswith('.docx'):
                # Pull the links out of it
                linklist.extend(getLinks(name, optionlist, False))
                filecount += 1

        # If it's a directory:
        if os.path.isdir(name):
            target_is_folder = True
            # Recursive version using os.walk for all levels.
            if 'r' in optionlist:
                for dirpath, dirnames, files in os.walk(name):
                    for eachfile in files:
                        # Pull links from every .docx file in that directory.
                        if eachfile.lower().endswith('.docx'):
                            linklist.extend(
                                getLinks(eachfile, optionlist, dirpath))
                            filecount += 1
            # Non-recursive version breaks os.walk after the first level.
            else:
                topfiles = []
                for (dirpath, dirnames, files) in os.walk(name):
                    topfiles.extend(files)
                    break
                for eachfile in topfiles:
                    if eachfile.lower().endswith('.docx'):
                        linklist.extend(getLinks(eachfile, optionlist,
                                                 dirpath))
                        filecount += 1

    # When called by other scripts, quietly return the list and stop.
    if 'l' in optionlist:
        return linklist

    # Otherwise, output a file and print some info.
    print('\nChecked ' + str(filecount) + ' .docx file' +
          ('' if filecount == 1 else 's') + ' for links.')

    # Create output file as sibling to the original target of the script.
    if target_is_folder:
        outFileFolder = os.path.abspath(os.path.join(file_names[0], os.pardir))
        outFilePath = os.path.join(outFileFolder, 'Word_Doc_Links.csv')
    else:
        outFilePath = os.path.join(os.path.dirname(file_names[0]),
                                   'Word_Doc_Links.csv')

    with open(outFilePath, 'wb') as outputFile:
        fieldnames = ['filename', 'url', 'linktext']

        writer = csv.DictWriter(outputFile,
                                fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()

        for row in linklist:
            writer.writerow(row)

    print('Spreadsheet created: Word_Doc_Links.csv')
    print('Location: ' + outFilePath)
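
This script and the next both pass extrasaction='ignore' to DictWriter, which silently drops any dictionary keys that are not in fieldnames instead of raising ValueError. A tiny sketch with the standard-library csv module (Python 3 here for brevity; unicodecsv mirrors the same DictWriter option):

import csv
import io

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=['filename', 'url'], extrasaction='ignore')
writer.writeheader()
writer.writerow({'filename': 'notes.docx', 'url': 'http://example.com', 'linktext': 'dropped'})
print(buf.getvalue())   # the 'linktext' value never appears in the output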
Ejemplo n.º 26
0
def writeCourseSheet(rootFileDir, rootFileName, course_dict, args):
    course_name = course_dict['name']
    if args.links: course_name += ' Links'
    course_name += '.tsv'

    outFileName = args.o if args.o else course_name

    # Create a "csv" file with tabs as delimiters
    with open(os.path.join(rootFileDir, outFileName), 'wb') as outputfile:
        fieldnames = [
            'chapter', 'sequential', 'vertical', 'component', 'type', 'url'
        ]

        # Include the XML if we're dealing with problems
        if args.problems:
            fieldnames.append('inner_xml')
        # Include link data if we're dealing with links
        if args.links:
            fieldnames = fieldnames + ['href', 'linktext']
        # Include video data if we're dealing with videos
        if args.video:
            fieldnames = fieldnames + [
                'duration', 'sub', 'youtube', 'edx_video_id', 'upload_name'
            ]

        writer = csv.DictWriter(outputfile,
                                delimiter='\t',
                                fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()

        spreadsheet = fillInRows(courseFlattener(course_dict))
        printable = []

        if args.all:
            printable = spreadsheet
        else:
            if args.links:
                printable += [
                    row for row in spreadsheet if row['type'] in
                    ['html', 'problem', 'xml', 'docx', 'pptx', 'xlsx']
                ]
            if args.html:
                printable += [
                    row for row in spreadsheet if row['type'] == 'html'
                ]
            if args.video:
                printable += [
                    row for row in spreadsheet if row['type'] == 'video'
                ]
            if args.problems:
                printable += [
                    row for row in spreadsheet if row['type'] == 'problem'
                ]

        for row in printable:
            if args.links:
                if row['href'] != '':
                    writer.writerow(row)
            else:
                writer.writerow(row)

        print('Spreadsheet created for ' + course_dict['name'] + '.')
        print('Location: ' + outFileName)
Ejemplo n.º 27
0
            print("location id not available")

    except:
        print("Failed to load locations")


if __name__ == "__main__":
    ''' e.g.: python 1934_glassdoor.py "Android developer" "new york" '''

    argparser = argparse.ArgumentParser()
    argparser.add_argument('keyword', help='job name', type=str)
    argparser.add_argument('place', help='job location', type=str)
    args = argparser.parse_args()
    keyword = args.keyword
    place = args.place
    print("Fetching job details")
    scraped_data = parse(keyword, place)
    print("Writing data to output file")

    with open('%s-%s-job-results.csv' % (keyword, place), 'wb') as csvfile:
        fieldnames = ['Salary']
        writer = csv.DictWriter(csvfile,
                                fieldnames=fieldnames,
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()
        if scraped_data:
            for data in scraped_data:
                writer.writerow(data)
        else:
            print("Your search for %s, in %s does not match any jobs" %
                  (keyword, place))
Ejemplo n.º 28
0
def create_neo4j_csv(results):
    """
    Create csv's for use by the neo4j import tool. Relies on create_neo4j_ functions
    output and transforms it to suitable format for automatic importing.
    Input: 
        - results: dic,
        json-style dictionary. Check create_neo4j_ function output for
        details
    Output:
        - None just saves the documents in the allocated path as defined
        in settings.yaml 
    """
    outpath = settings['out']['csv']['out_path']
    entities_nodes = None
    articles_nodes = None
    relations_edges = None
    entity_pmc_edges = None
    other_nodes = []
    other_edges = []
    for nodes in results['nodes']:
        if nodes['type'] == 'Entity':
            entities_nodes = nodes['values']
        elif nodes['type'] == 'Article':
            articles_nodes = nodes['values']
        else:
            other_nodes.extend(nodes['values'])
    for edges in results['edges']:
        if edges['type'] == 'relation':
            relations_edges = edges['values']
        elif edges['type'] == 'mention':
            entity_pmc_edges = edges['values']
        elif edges['type'] == 'NEW':
            other_edges.extend(edges['values'])

    dic_ = {
        'entities.csv': entities_nodes,
        'articles.csv': articles_nodes,
        'other_nodes.csv': other_nodes,
        'entities_pmc.csv':entity_pmc_edges, 
        'relations.csv':relations_edges,
        'other_edges.csv': other_edges
    }

    dic_fiels = {
        'entities.csv': ['id:ID', 'label', 'sem_types:string[]'],
        'articles.csv': ['id:ID', 'title', 'journal','sent_id:string[]'],
        'other_nodes.csv': ['id:ID'],
        'entities_pmc.csv':[':START_ID','score:float[]','sent_id:string[]', ':END_ID'], 
        'relations.csv':[':START_ID','subject_score:float[]','subject_sem_type:string[]',':TYPE','pred_type:string[]', 'object_score:float[]','object_sem_type:string[]','sent_id:string[]','negation:string[]',':END_ID'],
        'other_edges.csv':[':START_ID', ':TYPE', ':END_ID']
    }

    for k, toCSV in dic_.iteritems():
        if toCSV:
            keys = toCSV[0].keys()
            out = os.path.join(outpath, k)
            with open(out, 'wb') as output_file:
                time_log("Created file %s" % k)
                dict_writer = csv2.DictWriter(output_file, fieldnames=dic_fiels[k], encoding='utf-8')
                dict_writer.writeheader()
                dict_writer.writerows(toCSV)
    time_log('Created all documents needed')
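
For reference, the fieldnames above follow the neo4j-admin import header conventions (':ID', ':START_ID', ':END_ID', ':TYPE', and typed array properties such as 'sem_types:string[]'). A minimal sketch of one row in the 'entities.csv' shape, assuming csv2 is unicodecsv (as the encoding argument above suggests) and using illustrative values:

import unicodecsv as csv2

with open('entities_example.csv', 'wb') as f:
    w = csv2.DictWriter(f, fieldnames=['id:ID', 'label', 'sem_types:string[]'],
                        encoding='utf-8')
    w.writeheader()
    w.writerow({'id:ID': 'C0011849', 'label': 'Diabetes Mellitus',
                'sem_types:string[]': 'dsyn'})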
Ejemplo n.º 29
0
def open_csv():
    sources = []
    curr_chapter_name, curr_chapter_num, curr_topic_name, curr_topic_num, curr_source_num, curr_source, prev_rows = None, 0, None, 0, 0, None, []
    with open("sefer_haagada.csv", "rb") as fin:
        csv = unicodecsv.DictReader(fin)
        for row in csv:
            # if len(sources) >= 30:
            #     break
            if len(row["sourceNum"]) > 0:
                if len(prev_rows) > 0:
                    sources += [
                        make_parsed_source(curr_chapter_name, curr_chapter_num,
                                           curr_topic_name, curr_topic_num,
                                           curr_source_num, prev_rows)
                    ]
                    prev_rows = []
            # update chapter
            new_chapter = update_chapter(row)
            if new_chapter:
                if new_chapter[1] != curr_chapter_num + 1:
                    print("{} <= {} {}".format(new_chapter[1],
                                               curr_chapter_num,
                                               curr_chapter_name))
                curr_chapter_name, curr_chapter_num = new_chapter
                curr_topic_num = 0
                curr_source_num = 0
            # update topic
            new_topic = update_topic(row)
            if new_topic:
                if new_topic[1] != curr_topic_num + 1:
                    print("{} <= {} {}".format(new_topic[1], curr_topic_num,
                                               curr_topic_name))
                curr_topic_name, curr_topic_num = new_topic
            # update source num
            if len(row["sourceNum"]) > 0:
                new_source_num = gematria(row["sourceNum"])
                if new_source_num != curr_source_num + 1:
                    print("yoyoyo {} <= {} {} -- {}".format(
                        new_source_num, curr_source_num, curr_topic_name,
                        curr_topic_num))
                curr_source_num = new_source_num

            prev_rows += [row]
        if len(prev_rows) > 0:
            sources += [
                make_parsed_source(curr_chapter_name, curr_chapter_num,
                                   curr_topic_name, curr_topic_num,
                                   curr_source_num, prev_rows)
            ]
    sources = [_f for _f in sources if _f]
    sources = disambiguate_all(sources)
    with open("parsed.csv", "wb") as fout:
        csv = unicodecsv.DictWriter(fout, [
            "chapter_name", "chapter_num", "topic_name", "topic_num",
            "source_num", "source", "commentary", "good_ref_list",
            "bad_ref_list", "ref_list"
        ])
        csv.writeheader()
        for s in sources:
            s["ref_list"] = ", ".join(
                [r.normal() for r in s.get("ref_list", [])])
            s["good_ref_list"] = ", ".join(
                [r.normal() for r in s.get("good_ref_list", [])])
            s["bad_ref_list"] = ", ".join(
                [r.normal() for r in s.get("bad_ref_list", [])])
        csv.writerows(sources)
    with open("topics.csv", "wb") as fout:
        unique_topics = [{
            "chapter_name": x["chapter_name"],
            "topic_name": x["topic_name"]
        } for x in reduce(
            lambda a, b: a + ([b] if (len(a) == 0 or a[-1]['topic_name'] != b[
                'topic_name']) else []), sources, [])]
        csv = unicodecsv.DictWriter(fout, ["chapter_name", "topic_name"])
        csv.writeheader()
        csv.writerows(unique_topics)
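
The reduce() above keeps only the first row of each consecutive run of identical topic_name values; the same consecutive de-duplication reads more plainly with itertools.groupby. A sketch on stand-in rows that carry just the two fields the reduce looks at:

from itertools import groupby

sources = [
    {"chapter_name": "A", "topic_name": "t1"},
    {"chapter_name": "A", "topic_name": "t1"},
    {"chapter_name": "A", "topic_name": "t2"},
    {"chapter_name": "B", "topic_name": "t2"},
]

unique_topics = []
for topic_name, rows in groupby(sources, key=lambda s: s["topic_name"]):
    first = next(rows)  # first row of each consecutive run
    unique_topics.append({"chapter_name": first["chapter_name"],
                          "topic_name": topic_name})
# -> [{'chapter_name': 'A', 'topic_name': 't1'}, {'chapter_name': 'A', 'topic_name': 't2'}]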
Ejemplo n.º 30
0
def test_dict_dict():
    new_file = "avodah_zarah_little_letters.csv"
    ein_parser.run2("az_collapsed", "avodah_zarah")
    comp_file = "avodah_zarah_done_jan18.csv"
    new = []
    comp = []
    new_has_segments = False
    comp_has_segments = False
    with open(new_file, 'r') as csvfile:
        file_reader = csv.DictReader(csvfile)
        if "Line" in file_reader.fieldnames:
            new_has_segments = True
        for i, row in enumerate(file_reader):
            if not row:
                continue
            else:
                new_dict = {
                    "EM": row["original"],
                    "Rambam": row["Rambam"],
                    "Semag": row["Semag"],
                    "TurShA": row["Tur Shulchan Arukh"]
                }
                if new_has_segments:
                    new_dict['segment'] = '{}.{}'.format(
                        row['Daf'], row['Line'])
                new.append(new_dict)
    with open(comp_file, 'r') as csvfile:
        file_reader = csv.DictReader(csvfile)
        if "Line" in file_reader.fieldnames:
            comp_has_segments = True
        for i, row in enumerate(file_reader):
            if not row:
                continue
            else:
                new_dict = {
                    "EM": row["original"],
                    "Rambam": row["Rambam"],
                    "Semag": row["Semag"],
                    "TurShA": row["Tur Shulchan Arukh"]
                }
                if comp_has_segments:
                    new_dict['segment'] = '{}.{}'.format(
                        row['Daf'], row['Line'])
                comp.append(new_dict)
    missmatch_cnt = 0
    with open(u'az_test_diff.csv', 'w') as csv_file:
        writer = csv.DictWriter(csv_file, [u'line', u'old', u'new', u'EM'])
        writer.writeheader()
        for i, (a, b) in enumerate(zip(new, comp)):
            # assert a == b
            if new_has_segments:
                lineseg = a['segment']
            elif comp_has_segments:
                lineseg = b['segment']
            else:
                lineseg = i
            for k in a.keys():
                if a[k] != b[k]:
                    writer.writerow({
                        u'line': lineseg,
                        u'new': a[k],
                        u'old': b[k],
                        u'EM': a['EM']
                    })
                    missmatch_cnt += 1
    assert missmatch_cnt == 6