def write_rows(results, filepath): """Prints all the rows of data returned by the API. Args: results: The response returned from the Core Reporting API. filepath: The absolute path of the output file (CSV) """ if results.get('rows', []): f = open(filepath, 'w') names = return_column_headers(results) f.write('\t'.join(names) + '\tid\tcallNumber\n') # get ID from pagePath pgIdx = names.index('pagePath') total_count = len(results.get('rows')) for ii,row in enumerate(results.get('rows')): page_path = row[pgIdx] lcc = 'unknown' id_str = UP.parse_qs(UP.urlparse(page_path).query)['id'][0] # TODO: Exclude any pages that were requested as xml format? # First check if the item info is already in our own database item_obj = db.items.find_one({'uniqueid':id_str},{'call-number':True,'oclcnumber':True,'lcc':True}) # If not in database, need to grab info from library item view xml if item_obj is None: item_obj = item_view_xml.get_xml_object(id_str) # Store item object in database for future use db.items.save(item_obj) if 'lcc' in item_obj: # lcc gets added on later (not in originally returned xml) lcc = item_obj['lcc'] else: call_number = item_obj['call-number'] # Multiple call-numbers separated by '|' first_call_number = call_number.split('|')[0] lcc = first_call_number.split(' ')[0] # If call number doesn't fit the right pattern # then search the OCLC Classify service and pull the LCC category from there lcc_match = lcc_re.match(lcc) if lcc_match is None: if 'oclcnumber' in item_obj: oclc_str = item_obj['oclcnumber'] lcc = oclc_classify_xml.get_lcc_class(oclc_str) # Test one more time and blank out if not right if lcc is None: lcc = 'unknown' else: lcc_match = lcc_re.match(lcc) if lcc_match is None: lcc = 'unknown' else: lcc = 'unknown' # Store in db for future use db.items.update({'uniqueid':id_str},{'$set':{'lcc':lcc}}) print ii, '/', total_count, lcc f.write('\t'.join(row) + '\t' + id_str + '\t' + lcc + '\n') else: print 'No Rows Found'
def rows_to_mongo(results, date_str): """Prints all the rows of data returned by the API. Args: results: The response returned from the Core Reporting API. filepath: The absolute path of the output file (CSV) """ if results.get('rows', []): names = return_column_headers(results) # get ID from pagePath pgIdx = names.index('pagePath') total_count = len(results.get('rows')) for ii,row in enumerate(results.get('rows')): page_view = {} date_hour_str = date_str + row[names.index('hour')] timestamp = datetime.strptime(date_hour_str, '%Y-%m-%d%H') for jj in range(len(row)): if names[jj] == 'latitude' or names[jj] == 'longitude': if 'loc' not in page_view: # spherical geospatial search and GeoJSON assume [longitude,latitude] ordering # and two-element list is MongoDB's recommended format for 2d locations page_view['loc'] = [float(row[names.index('longitude')]), float(row[names.index('latitude')])] elif names[jj] == 'hour': page_view['timestamp'] = timestamp elif names[jj] == 'visitors': page_view[names[jj]] = int(row[jj]) else: page_view[names[jj]] = row[jj] page_path = row[pgIdx] lcc = 'unknown' parsed_query = UP.parse_qs(UP.urlparse(page_path).query) # NOTE: Exclude any pages that were requested as xml format !!! if ('output-format' in parsed_query) and (parsed_query['output-format'] == 'xml'): continue if ('id' not in parsed_query): continue id_str = parsed_query['id'][0] page_view['uniqueid'] = id_str # First check if the item info is already in our own database item_obj = db.items.find_one({'uniqueid':id_str},{'call-number':True,'oclcnumber':True,'lcc':True}) # If not in database, need to grab info from library item view xml if item_obj is None: item_obj = item_view_xml.get_xml_object(id_str) # Once in a while get back an empty object from library page if len(item_obj) == 0: # NOTE: Not counting page view event if can't find page item XML !!! continue else: # Store item object in database for future use db.items.save(item_obj) if 'lcc' in item_obj: # lcc gets added on later (not in originally returned xml) lcc = item_obj['lcc'] else: if 'call-number' in item_obj: call_number = item_obj['call-number'] # Multiple call-numbers separated by '|' first_call_number = call_number.split('|')[0] lcc = first_call_number.split(' ')[0] # If call number doesn't fit the right pattern # then search the OCLC Classify service and pull the LCC category from there lcc_match = lcc_re.match(lcc) if lcc_match is None: if 'oclcnumber' in item_obj: oclc_str = item_obj['oclcnumber'] lcc = oclc_classify_xml.get_lcc_class(oclc_str) # Test one more time and blank out if not right if lcc is None: lcc = 'unknown' else: lcc_match = lcc_re.match(lcc) if lcc_match is None: lcc = 'unknown' else: lcc = 'unknown' # no call number, either else: lcc = 'unknown' # Store in db for future use db.items.update({'uniqueid':id_str},{'$set':{'lcc':lcc}}) print date_str, ' : ', ii, '/', total_count, lcc page_view['lcc'] = lcc lcc_match = lcc_re.match(lcc) if lcc_match is not None: page_view['lcc_category'] = lcc_match.group(1) page_view['lcc_first_letter'] = lcc_match.group(1)[:1] # Not creating own unique _id field, so check for date + hour & page match before saving pv_obj = db.pageviews.find_one({'uniqueid':id_str,'timestamp':timestamp},{'_id':True}) if pv_obj is None: db.pageviews.save(page_view, safe=True) else: print 'No Rows Found'