Example #1
def getMatch(url, i):
    req = urllib2.Request(url + str(i), headers={'User-Agent' : "scraping bot"})
    con = urllib2.urlopen(req)
    print(url + str(i) + " " + str(con.getcode()))
    html = con.read()
    soup = BeautifulSoup(html, 'html.parser')
    if len(soup.select("body > main > section > h1")) > 0 and soup.select("body > main > section > h1")[0].text == '404':
        print ("404, matchid = " + str(i))
        with open(csvpath, 'ab') as csvfile:
            writer = unicodecsv.writer(csvfile, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow([i, "404"])
        return
    date = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div:nth-of-type(1) > div:nth-of-type(3)")[0]
    date = date['title'] + " " + date.text.strip()
    bo = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div:nth-of-type(1) > div:nth-of-type(2)")[0].text
    team_a = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > a:nth-of-type(1) > span > b")[0].text
    team_b = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > a:nth-of-type(2) > span > b")[0].text
    team_a_pct = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > a:nth-of-type(1) > span")[0].i.text
    team_b_pct = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > a:nth-of-type(2) > span")[0].i.text
    team_a_odds = re.findall(r'(\d+(\.\d+)? (to \d+(\.\d+)? )?for 1)',str(soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div.full")[0].select('div.half')[0].find('div')))[0][0]
    team_b_odds = re.findall(r'(\d+(\.\d+)? (to \d+(\.\d+)? )?for 1)',str(soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div.full")[0].select('div.half')[1].find('div')))[0][0]
    team_a_won = "(win)" in team_a.lower()
    team_b_won = "(win)" in team_b.lower()
    status = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div:nth-of-type(2)")[0]
    if status.has_attr('class') and 'full' in status['class']:
        status = ""
    else:
        status = status.text.strip()
    winner = "a" if team_a_won else "b" if team_b_won else "none"

    with open(csvpath, 'ab') as csvfile:
        writer = unicodecsv.writer(csvfile, quoting=csv.QUOTE_NONNUMERIC)
        writer.writerow([i, date, bo, team_a, team_b, team_a_pct, team_b_pct, team_a_odds, team_b_odds, winner, status])
Example #2
def import_raw_xlsx():
    fields = [
        'la_code',
        'ba_ref',
        'prop_empty:boolean',
        'prop_empty_date:date~make_date_YYYY_MM_DD',
        'prop_occupied:boolean',
        'prop_occupied_date:date',
        'prop_ba_rates:numeric',
        'tenant',
    ]

    files = [f for f in listdir(DIR) if isfile(join(DIR, f))]


    with open('vacancy_errors.csv', 'w') as fe:
        with open('vacancy.csv', 'w') as fp:
            a = csv.writer(fp, delimiter=',')
            e = csv.writer(fe, delimiter=',')
            a.writerows([fields])
            e.writerows([['file', 'line', 'error'] + fields])
            for f in files:
                out, errors = process(f)
                a.writerows(out)
                e.writerows(errors)
Example #3
def process_raw_msg(raw_msg, formatted_output_file, append=True):
    """
    Given a Python list of raw messages and an output CSV file
    to write to, write details of the messages out to the CSV
    file in the format:
        <sender-domain>,<subject>,<message-text>
    """
    if append:
        mode = "ab"
    else:
        mode = "wb"
    mime_msg = email.message_from_string(raw_msg)
    text = remove_punctuation(html_to_text(concat_email_text(mime_msg)))
    subject = mime_msg.get("Subject")
    # Decode escaped character sets in the subject line
    subject = u" ".join([a[0].decode('utf-8', 'replace')
                         for a in email.header.decode_header(subject)])
    subject = remove_punctuation(subject.replace("\r", " ").replace("\n", " "))
    sender_domain = mime_msg.get("From").split("@")[1].split(">")[0]#\
                                                      #.decode("utf-8")
    # Strip whitespace
    csv_line = [fix_spaces_cr_lf(s) for s in [sender_domain, subject, text]]
    # If any of our strings are empty, replace with a placeholder
    # to make sure each CSV line has three items.
    csv_line = map(lambda s: (u'' == s) and u"PLACEHOLDERNONE" or s ,
                   csv_line)
    if formatted_output_file == "STDOUT":
        writer = unicodecsv.writer(sys.stdout,
                                 quoting=unicodecsv.QUOTE_ALL)
        writer.writerow(csv_line)
    else:
        with open(formatted_output_file, mode) as handle:
            writer = unicodecsv.writer(handle,
                                   quoting=unicodecsv.QUOTE_ALL)
            writer.writerow(csv_line)
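
The snippet above relies on project helpers that are not shown (remove_punctuation, html_to_text, concat_email_text, fix_spaces_cr_lf). A minimal standalone sketch of the same idea, decoding an RFC 2047 subject and emitting one <sender-domain>,<subject>,<message-text> row with unicodecsv; the sample message below is made up:

import email
import email.header
import io
import unicodecsv


def decode_subject(raw_subject):
    # Decode RFC 2047 encoded words into one unicode string.
    parts = []
    for value, charset in email.header.decode_header(raw_subject):
        if isinstance(value, bytes):
            value = value.decode(charset or 'utf-8', 'replace')
        parts.append(value)
    return u" ".join(parts)


raw_msg = ("From: Example Sender <sender@example.com>\r\n"
           "Subject: =?utf-8?b?SGVsbG8gd29ybGQ=?=\r\n"
           "\r\n"
           "Body text here.\r\n")
mime_msg = email.message_from_string(raw_msg)
sender_domain = mime_msg.get("From").split("@")[1].split(">")[0]
row = [sender_domain, decode_subject(mime_msg.get("Subject")),
       mime_msg.get_payload().strip()]

buf = io.BytesIO()
unicodecsv.writer(buf, quoting=unicodecsv.QUOTE_ALL).writerow(row)
print(buf.getvalue().decode('utf-8'))  # "example.com","Hello world","Body text here."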
Example #4
def processData():
    global manualIgnoreRecords
    global yesIgnoreRecords
    global manualProcessedRecords
    global yesProcessedRecords

    dirpath = parentdir + "/R3_profiles_YNNM_raw/"
    with open(dirpath + 'MANUAL_RAW.csv', 'r') as infile, open(processeddir + 'MANUAL_PROCESSED.csv', 'ab') as outfile:
        rows = unicodecsv.reader(infile, delimiter=';', encoding='utf-8')
        writer = unicodecsv.writer(outfile, delimiter=';', encoding='utf-8')
        for row in rows:
            if row[6] in manual_ignore_list:  # Ignore it
                manualIgnoreRecords += 1
                continue
            else:
                manualProcessedRecords += 1
                writer.writerow(row)

    with open(dirpath + 'YES_RAW.csv', 'r') as infile, open(processeddir + 'YES_PROCESSED.csv', 'ab') as outfile:
        rows = unicodecsv.reader(infile, delimiter=';', encoding='utf-8')
        writer = unicodecsv.writer(outfile, delimiter=';', encoding='utf-8')
        for row in rows:
            if row[6] in yes_ignore_list:  # Ignore it
                yesIgnoreRecords += 1
                continue
            else:
                yesProcessedRecords += 1
                writer.writerow(row)
Example #5
def csvlist_to_string(csvlist):
    f = StringIO()
    csv.writer(f, quoting=csv.QUOTE_MINIMAL,
               encoding='utf-8').writerow(csvlist)
    string = f.getvalue()
    f.close()
    return string
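
The standard-library csv.writer has no encoding argument, so the csv and StringIO names above are presumably unicodecsv and a bytes-capable buffer imported under those aliases. A sketch of imports that would make the helper work, plus a round trip; both aliases are assumptions:

from io import BytesIO as StringIO    # assumption: a byte buffer, since unicodecsv writes bytes
import unicodecsv as csv              # assumption: stdlib csv has no encoding= keyword


def csvlist_to_string(csvlist):
    f = StringIO()
    csv.writer(f, quoting=csv.QUOTE_MINIMAL,
               encoding='utf-8').writerow(csvlist)
    string = f.getvalue()
    f.close()
    return string


assert csvlist_to_string([u'a', u'b,c']) == b'a,"b,c"\r\n'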
Example #6
def export_as_csv(queryset, fields=None, header=None, filename=None, options=None, out=None):
    """
        Exports a queryset as csv from a queryset with the given fields.

    :param queryset: queryset to export
    :param fields: list of fields names to export. None for all fields
    :param header: if True, the exported file will have the first row as column names
    :param filename: name of the filename
    :param options: CSVOptions() instance or none
    :param: out: object that implements File protocol. HttpResponse if None.

    :return: HttpResponse instance
    """
    if out is None:
        if filename is None:
            filename = filename or "%s.csv" % queryset.model._meta.verbose_name_plural.lower().replace(" ", "_")
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment;filename="%s"' % filename.encode('us-ascii', 'replace')
    else:
        response = out

    if options is None:
        config = csv_options_default
    else:
        config = csv_options_default.copy()
        config.update(options)

    if fields is None:
        fields = [f.name for f in queryset.model._meta.fields]

    dialect = config.get('dialect', None)
    if dialect is not None:
        writer = csv.writer(response, dialect=dialect)
    else:
        writer = csv.writer(response,
                            escapechar=str(config['escapechar']),
                            delimiter=str(config['delimiter']),
                            quotechar=str(config['quotechar']),
                            quoting=int(config['quoting']))

    if bool(header):
        if isinstance(header, (list, tuple)):
            writer.writerow(header)
        else:
            writer.writerow([f for f in fields])

    for obj in queryset:
        row = []
        for fieldname in fields:
            value = get_field_value(obj, fieldname)
            if isinstance(value, datetime.datetime):
                value = dateformat.format(value, config['datetime_format'])
            elif isinstance(value, datetime.date):
                value = dateformat.format(value, config['date_format'])
            elif isinstance(value, datetime.time):
                value = dateformat.format(value, config['time_format'])
            row.append(smart_str(value))
        writer.writerow(row)

    return response
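
The options dict passed in is merged over csv_options_default, and its keys drive either a named dialect or the explicit writer arguments. A standalone sketch of that writer-construction branch, with an illustrative config whose values are made up:

import csv
import sys

config = {
    'dialect': None,          # set to e.g. 'excel' to ignore the explicit options below
    'escapechar': '\\',
    'delimiter': ';',
    'quotechar': '"',
    'quoting': csv.QUOTE_ALL,
}

dialect = config.get('dialect')
if dialect is not None:
    writer = csv.writer(sys.stdout, dialect=dialect)
else:
    writer = csv.writer(sys.stdout,
                        escapechar=str(config['escapechar']),
                        delimiter=str(config['delimiter']),
                        quotechar=str(config['quotechar']),
                        quoting=int(config['quoting']))

writer.writerow(['id', 'name'])
writer.writerow([1, 'Ada; Lovelace'])   # prints "1";"Ada; Lovelace"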
Example #7
    def export_list(self, queryset):
        headers = [
            'Mobile', 'Text', 'Direction', 'Created', 'Delivered'
        ]
        output = StringIO()
        writer = unicodecsv.writer(output, encoding='utf-8')
        writer.writerow([unicode(i) for i in headers])
        yield output.getvalue()
        output.close()

        for message in queryset:
            # limit to three numbers for export and pad if less than three
            record = [
                message.sender if message.direction == 'IN'
                else message.recipient,
                message.text,
                message.direction,
                message.received.strftime('%Y-%m-%d %H:%M:%S')
                if message.received else '',
                message.delivered.strftime('%Y-%m-%d %H:%M:%S')
                if message.delivered else ''
            ]

            output = StringIO()
            writer = unicodecsv.writer(output, encoding='utf-8')
            writer.writerow([unicode(i) for i in record])
            yield output.getvalue()
            output.close()
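
Because export_list yields the header and then one CSV chunk per message, it can be fed to a streaming HTTP response instead of being joined in memory. A hedged sketch of one possible consumer; the Flask app, Message queryset, and exporter object are assumptions, not part of the example above:

from flask import Flask, Response

app = Flask(__name__)

@app.route('/messages.csv')
def messages_csv():
    queryset = Message.objects()                 # hypothetical queryset
    generator = exporter.export_list(queryset)   # the generator defined above
    return Response(
        generator,
        mimetype='text/csv',
        headers={'Content-Disposition': 'attachment; filename=messages.csv'})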
Example #8
def xml2csv(file):
	
	# open file, parse it and get root
	tree = et.parse(file)
	root = tree.getroot()

	# setup csv writer for our data output
	csvwriter = csv.writer(sys.stdout)
	# print header row
	#csvwriter.writerow( [ ['CVE-ID'], ['CVSS Risk'], ['Summary'] ] )

	for entry_node in tree.xpath('*[local-name()="entry"]'):

		# Declare and initialise variables for use.
		vuln_id = "Unknown"
		vuln_score = "Unknown"
		vuln_summary = "Unknown"
	
		# get cve_id value
		vuln_id = entry_node.get('id')

		# get vuln summary
		index = len(entry_node.getchildren())
		summarynode = entry_node.getchildren()[index-1]
		vuln_summary = summarynode.text

		# get cvss risck score
		for n in entry_node.getchildren():
			if n.tag.find('cvss') != -1:
				cvss_node = n
				vuln_score = cvss_node.getchildren()[0].getchildren()[0].text

		row = [ vuln_id, vuln_score, vuln_summary ]
		csvwriter = csv.writer(sys.stdout)
		csvwriter.writerows( [row] )
Example #9
def augment_csv(src_csv,dest_csv):
    data = load(src_csv,with_keys=True)
    master_description = master_table(data[1:],DESCRIPTION)
    con_description = confidence_table(master_description)
    master_tags = master_table(data[1:],TAGS)
    con_tags = confidence_table(master_tags)
    master_title = master_table(data[1:],TITLE)
    con_title = confidence_table(master_title)
    data[0].append('guess_description')
    data[0].append('confidence')
    data[0].append('guess_tags')
    data[0].append('confidence')
    data[0].append('guess_title')
    data[0].append('confidence')
    for n in range(1,len(data)):
        theme,confidence = guess_for_row(data[n],con_description)
        data[n].append(theme)
        data[n].append(confidence)
        theme,confidence = guess_for_row(data[n],con_tags)
        data[n].append(theme)
        data[n].append(confidence)
        theme,confidence = guess_for_row(data[n],con_title)
        data[n].append(theme)
        data[n].append(confidence)
    with open(dest_csv,'w') as f:
        unicodecsv.writer(f).writerows(data)
Example #10
def writerawresults(data, columns, placeholderurl, filename):
    csvoutfile = open(sys.argv[2] + '.csv', 'wb')
    datawriter = csv.writer(csvoutfile, delimiter=',')

    #Not needed in the long term. This was for comparing the file-finding capabilities of
    #of different methods
    csvfilesoutfile = open(sys.argv[2]+'.files.csv', 'wb')
    filesdatawriter = csv.writer(csvfilesoutfile, delimiter=',')
    
    row = []
    extraitems = ['format', 'geo', 'groups', 'tags']
    row.extend(extraitems);
    columnsoffset = len(extraitems)
    
    for column in columns:
        row.append(column)

    datawriter.writerow(row)
    
    for package in data:
        row = []
    
        #All files, for analysis
        dict_string = package['data_dict']
        json_dict = json.loads(dict_string)
        for resource in json_dict['resources']:
            if 'url' in resource:
                frow = []
                frow.append(resource['url'])
                filesdatawriter.writerow(frow)
    
        #Get resource formats
        if ('res_format' in package):
            [text, geo] = processListOfFormats(package['res_format'])
            row.extend([text, geo])
        else:
            row.extend(['', ''])
    
        groups = u''
        tags = u''

        if 'groups' in package:
            row.append(arraytocsv(package['groups']))

        if 'tags' in package:
            row.append(arraytocsv(package['tags']))

        for column in columns:
            if column in package: 
                row.append(package[column])
            else:
                row.append('')

        if row[columns.index('url') + columnsoffset] == '':
            row[columns.index('url') + columnsoffset] = placeholderurl + row[columns.index('id') + columnsoffset]    
        datawriter.writerow(row)
    
    csvoutfile.close();
    csvfilesoutfile.close();
Example #11
    def __clean_and_fill_csv(self, file_path, headers, data_list):
        # Write the header and the data rows in a single pass; reopening the
        # file in append mode before the first handle is flushed risks losing
        # or reordering the header.
        with open(file_path, 'wb') as myfile:
            wr = unicodecsv.writer(myfile, quoting=csv.QUOTE_NONE, delimiter='\t')
            wr.writerow(headers)
            wr.writerows(data_list)
Example #12
def fetch_rolling():
    output = []
    min_date = connection.execute(text('SELECT MIN(date) FROM races')).first()[0].replace(day=1)
    for begin_date in list(rrule.rrule(rrule.MONTHLY, dtstart=min_date, until=date.today())):
        end_date = begin_date + relativedelta.relativedelta(months=6)
        sql = 'SELECT AVG(avg), STDDEV(avg), AVG(dev) FROM (SELECT AVG(ranking) avg, STDDEV(ranking) dev FROM rankings WHERE rank_date BETWEEN :begin_date AND :end_date GROUP BY _driver) avg'
        result = connection.execute(text(sql), begin_date=begin_date, end_date=end_date).first()
        output.append([end_date.strftime('%Y-%m')] + result.values())
    unicodecsv.writer(open('charts/rolling_averages.csv', 'w')).writerows(output)
Example #13
 def _split_batch_csv(self, data_file, f_values, f_ids):
     writer_values = unicodecsv.writer(f_values)
     writer_ids = unicodecsv.writer(f_ids)
     for row in unicodecsv.reader(data_file):
         writer_values.writerow(row[1:])
         writer_ids.writerow([row[:1]])
     f_values.seek(0)
     f_ids.seek(0)
     return f_values, f_ids
Example #14
def main(argv=None):
    if argv is None:
        argv = sys.argv
        
    parser = argparse.ArgumentParser(
        description="Parse program execution entries from the Amcache.hve Registry hive")
    parser.add_argument("registry_hive", type=str,
                        help="Path to the Amcache.hve hive to process")
    parser.add_argument("-v", action="store_true", dest="verbose",
                        help="Enable verbose output")
    parser.add_argument("-t", action="store_true", dest="do_timeline",
                        help="Output in simple timeline format")
    args = parser.parse_args(argv[1:])

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if sys.platform == "win32":
        import os, msvcrt
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        
    r = Registry.Registry(args.registry_hive)

    try:
        ee = parse_execution_entries(r)
    except NotAnAmcacheHive:
        g_logging.error("doesn't appear to be an Amcache.hve hive")
        return

    if args.do_timeline:
        entries = []
        for e in ee:
            for t in ["first_run", "created_timestamp", "modified_timestamp",
                    "modified_timestamp2", "linker_timestamp"]:
                ts = getattr(e, t)
                if ts == UNIX_TIMESTAMP_ZERO:
                    continue
                if ts == WINDOWS_TIMESTAMP_ZERO:
                    continue
                if ts == datetime.datetime.min:
                    continue

                entries.append(TimelineEntry(ts, t, e))
        w = unicodecsv.writer(sys.stdout, delimiter="|", quotechar="\"",
                              quoting=unicodecsv.QUOTE_MINIMAL, encoding="utf-8")
        w.writerow(["timestamp", "timestamp_type", "path", "sha1"])
        for e in sorted(entries, key=lambda e: e.timestamp):
            w.writerow([e.timestamp, e.type, e.entry.path, e.entry.sha1])
    else:
        w = unicodecsv.writer(sys.stdout, delimiter="|", quotechar="\"",
                              quoting=unicodecsv.QUOTE_MINIMAL, encoding="utf-8")
        w.writerow(map(lambda e: e.name, FIELDS))
        for e in ee:
            w.writerow(map(lambda i: getattr(e, i.name), FIELDS))
Example #15
def createTrainTestFiles(inputFileName,folderName,trainSize,testSize,totalSamples=6034196,tags=None):
    if not os.path.exists('data/'+folderName):
        os.makedirs('data/'+folderName)
    writerSampleTrain = unicodecsv.writer(open('data/'+folderName +'/TrainSamples.csv', 'w'))
    writerSampleTest = unicodecsv.writer(open('data/'+folderName +'/TestSamples.csv', 'w'))
    readerSample = unicodecsv.reader(open('data/parsed/'+inputFileName+'Samples.csv', 'r'))
    
    writerIdsTrain = open('data/'+folderName +'/TrainIds', 'w')
    writerIdsTest = open('data/'+folderName +'/TestIds', 'w')
    readerIds = open('data/parsed/'+inputFileName+'Ids', 'r')
    
    writerTagsTrain = open('data/'+folderName +'/TrainTags', 'w')
    writerTagsTest = open('data/'+folderName +'/TestTags', 'w')
    readerTags = open('data/parsed/'+inputFileName+'Tags', 'r')
    

    if trainSize>=totalSamples:
        trainSize = totalSamples-testSize

    i = 0
    trainCount=0
    testCount=0
    for rowSample,rowId,rowTags in izip(readerSample,readerIds,readerTags):
        i+=1
        if tags is not None:
            toContinue = False
            for t in rowTags.split():
                if t not in tags:
                    toContinue = True
                    break
            if toContinue:
                continue
        x = randint(0,totalSamples)
        if x<testSize:
            writerSampleTest.writerow(rowSample)
            writerIdsTest.write(rowId)
            writerTagsTest.write(rowTags)
            testCount+=1
        else:
            if trainCount<trainSize:
                writerSampleTrain.writerow(rowSample)
                writerIdsTrain.write(rowId)
                writerTagsTrain.write(rowTags)
                trainCount+=1
            elif randint(0,5)<2:
                writerSampleTest.writerow(rowSample)
                writerIdsTest.write(rowId)
                writerTagsTest.write(rowTags)
                testCount+=1 
        if testCount>=testSize and trainCount>=trainSize:
            break
    
    print 'train : ' + str(trainCount)
    print 'test : ' + str(testCount)
    print 'total : ' + str(i)
Example #16
 def writerow(self, row):
     row = [str(s) for s in row]
     try:
         # Text buffer first: works where csv.writer emits text (Python 3).
         queue = StringIO()
         self.writer = csv.writer(queue, lineterminator="")
         self.writer.writerow(row)
     except TypeError:
         # Fall back to a byte buffer where csv.writer emits bytes (Python 2)
         # and the text buffer refuses them.
         queue = BytesIO()
         self.writer = csv.writer(queue, lineterminator="")
         self.writer.writerow(row)
     return queue.getvalue()
Example #17
    def process_tweets(self, dtype):
        self.fuser = open('users.csv', 'wb')
        self.ft = None
        self.csv_u = unicodecsv.writer(self.fuser, delimiter=',')

        if dtype == 'json':
            f = open('csm-tweets-nairobi.json', 'rb')
            raw_tweets = f.readlines()
            fjson_tweets = []
            count = 1
            print 'loading tweets from fake json\n'
            for raw_tweet in raw_tweets:
                tweet = ast.literal_eval(raw_tweet.rstrip("\n"))

                if count % 50000 == 1:
                    if self.ft is not None:
                        self.ft.close()
                    self.ft= open('tweets_%d.csv'%(count/10000+1), 'wb')
                    self.csv_t = unicodecsv.writer(self.ft, delimiter=',')

                self.createDicts(tweet)
                if count % 50000 == 0:
                    print '%d tweets processed!\n'%(count)
                    self.batchInsertDicts()

                count += 1

            self.insertMentions()
            f.close()

        elif dtype == 'pickle':
            count = 1
            for fn in glob.glob("*.pickle"):
                f = open(fn, 'rb')
                tweets = pickle.load(f)
                print 'tweets extracted from pickle!\n'
                for tweet in tweets:
                    if count % 50000 == 1:
                        if self.ft is not None:
                            self.ft.close()
                        self.ft= open('tweets_%d.csv'%(count/10000+1), 'wb')
                        self.csv_t = csv.writer(self.ft, delimiter=',')
                    self.createDicts(tweet)
                    count += 1

                self.batchInsertDicts()
                print 'another pickle processed!\n'
                f.close()

            self.insertMentions()

        else:
            print 'wrong data type!'
            return None
Example #18
    def export_list(self, queryset):
        headers = [
            u'Participant ID', u'Name', u'Partner', u'Role',
            u'Location ID', u'Supervisor ID', u'Gender', u'Email', u'Password',
            u'Phone Primary', u'Phone Secondary #1', u'Phone Secondary #2'
        ]

        if queryset.count():
            location_types = LocationType.objects(
                is_administrative=True, deployment=queryset.first().deployment)
            headers = headers[:5] + \
                map(lambda location_type: location_type.name, location_types) + \
                headers[5:]
            for extra_field in queryset.first().deployment.participant_extra_fields:
                headers.append(extra_field.label)

        output = StringIO()
        writer = unicodecsv.writer(output, encoding='utf-8')
        writer.writerow([unicode(i) for i in headers])
        yield output.getvalue()
        output.close()

        for participant in queryset:
            # limit to three numbers for export and pad if less than three
            phone_numbers = [i.number for i in participant.phones][:3]
            phone_numbers += [''] * (3 - len(phone_numbers))

            record = [
                participant.participant_id if participant.participant_id
                else '',
                participant.name if participant.name else '',
                participant.partner.name if participant.partner else '',
                participant.role.name if participant.role else '',
                participant.location.code if participant.location else ''] + \
                [participant.location_name_path.get(
                 location_type.name, '')
                 for location_type in location_types] + \
                [participant.supervisor.participant_id if participant.supervisor
                else '',
                participant.gender if participant.gender else '',
                participant.email if participant.email else '',
                participant.password if participant.password else '',
            ]

            record.extend(phone_numbers)

            for extra_field in participant.deployment.participant_extra_fields:
                record.append(getattr(participant, extra_field.name, ''))

            output = StringIO()
            writer = unicodecsv.writer(output, encoding='utf-8')
            writer.writerow([unicode(i) for i in record])
            yield output.getvalue()
            output.close()
Example #19
    def __init__(self, *args, **kwargs):
        self.dataset_file = tempfile.NamedTemporaryFile(delete=False)
        self.resource_file = tempfile.NamedTemporaryFile(delete=False)

        self.dataset_csv = csv.writer(self.dataset_file)
        self.resource_csv = csv.writer(self.resource_file)

        self.dataset_filename = self.dataset_file.name
        self.resource_filename = self.resource_file.name

        self.organization_cache = {}

        self.keys = []
Example #20
    def export_list(self, queryset):
        headers = []
        location_types = list(LocationTypesService().find().order_by("ancestors_ref"))
        for location_type in location_types:
            location_name = location_type.name.upper()
            headers.append("{}_N".format(location_name))
            headers.append("{}_ID".format(location_name))
            if location_type.has_political_code:
                headers.append("{}_PCODE".format(location_name))
            if location_type.has_registered_voters:
                headers.append("{}_RV".format(location_name))
            for metafield in location_type.metafields:
                headers.append("{}_{}".format(location_name, metafield.upper()))

        output = StringIO()
        writer = unicodecsv.writer(output, encoding="utf-8")
        writer.writerow([unicode(i) for i in headers])
        yield output.getvalue()
        output.close()

        if queryset.count() < 1:
            yield
        else:
            locations = queryset
            locations = locations.order_by("code")
            for location in locations:
                record = []
                for location_type in location_types:
                    try:
                        this_location = filter(
                            lambda l: l.location_type == location_type.name, location.ancestors_ref
                        ).pop()
                    except IndexError:
                        if location.location_type == location_type.name:
                            this_location = location
                        else:
                            this_location = None
                    record.append(this_location.name or "" if this_location else "")
                    record.append(this_location.code or "" if this_location else "")
                    if location_type.has_political_code:
                        record.append(this_location.political_code or "" if this_location else "")
                    if location_type.has_registered_voters:
                        record.append(this_location.registered_voters or "" if this_location else "")
                    for metafield in location_type.metafields:
                        record.append(getattr(this_location, metafield, "") if this_location else "")

                output = StringIO()
                writer = unicodecsv.writer(output, encoding="utf-8")
                writer.writerow([unicode(i) for i in record])
                yield output.getvalue()
                output.close()
Example #21
File: csv.py Project: kball/ambry
    def _init_writer(self, row):
        from sqlalchemy.engine.result import RowProxy
        # Four cases:
        #    Write header, or don't
        #    Write list, or dict

        row_is_dict = isinstance(row, dict) or isinstance(row, RowProxy)
        row_is_list = isinstance(row, (list, tuple))


        has_header = self.header is not None

        if not os.path.exists(self.path):
            if not os.path.exists(os.path.dirname(self.path)):
                os.makedirs(os.path.dirname(self.path))

        f = open(self.path, 'wb', buffering=self.buffer_size)
        
        self._f = f

        delimiter = self.delimiter
        
        if row_is_dict and has_header:
            self._writer = unicodecsv.DictWriter(f, self.header, delimiter=delimiter, 
                                                 escapechar=self.escapechar, encoding=self.encoding)
            if self.write_header:
                self._writer.writeheader()
            self._inserter = self._write_dict
            
        elif row_is_dict and not has_header:
            self.header = row.keys()
            self._writer = unicodecsv.DictWriter(f, self.header, delimiter=delimiter, 
                                                 escapechar=self.escapechar, encoding=self.encoding)
            if self.write_header:
                self._writer.writeheader()            
            self._inserter = self._write_dict
            
        elif row_is_list and has_header:
            self._writer = unicodecsv.writer(f, delimiter=delimiter, 
                                             escapechar=self.escapechar, encoding=self.encoding)
            if self.write_header:
                self._writer.writerow(self.header)
            self._inserter = self._write_list
            
        elif row_is_list and not has_header:
            self._writer = unicodecsv.writer(f, delimiter=delimiter, 
                                             escapechar=self.escapechar, encoding=self.encoding)
            self._inserter = self._write_list

        else:
            raise Exception("Unexpected case for type {}".format(type(row)))
Example #22
def yield_csv_catalog(datasets):
    '''Yield a dataset catalog line by line'''
    csvfile = StringIO.StringIO()
    writer = unicodecsv.writer(csvfile, encoding='utf-8', delimiter=b',', quotechar=b'"')
    # Generate header
    specs = Metric.get_for(Dataset)
    writer.writerow(header(specs))
    yield csvfile.getvalue()

    for dataset in datasets:
        csvfile = StringIO.StringIO()
        writer = unicodecsv.writer(csvfile, encoding='utf-8', delimiter=b',', quotechar=b'"')
        writer.writerow(to_row(dataset, specs))
        yield csvfile.getvalue()
Example #23
def solve():
    for init_file in os.listdir(FOLDER):
        start_file = datetime.datetime.now()
        util.write_log(LOGS, "%s start\n" % init_file)
        rows = extract_info(FOLDER + init_file)
        filename = init_file.split(".")[0]
        folder = RESULT + "/" + filename
        os.mkdir(folder)
        os.chdir(folder)
        # select (cut) a part of rows to match
        file_idx = 0
        length = len(rows)
        row_written = -1
        last_diff = 0
        last_time = int(MAX_NUM)
        i = 0
        time_i = rows[0][TIME]
        mileage_i = rows[0][MILE]
        j = 1
        while j < length:
            mileage_j = rows[j][MILE]
            time_j = rows[j][TIME]
            if mileage_i == mileage_j:
                if (row_written != -1) and (time_j - last_time >= THIRTY_MINUTES):
                    if row_written != last_diff:
                        match(row_written, last_diff, rows)
                        test_over_speed(row_written, last_diff, rows)
                        with open(str(file_idx) + ".csv", "wb") as output_csv:
                            writer = ucsv.writer(output_csv)
                            writer.writerows(rows[row_written:last_diff + 1])
                        file_idx += 1
                    row_written = -1
            else:
                last_diff = i
                last_time = time_i
                if row_written == -1:
                    row_written = last_diff
            i += 1
            time_i = time_j
            mileage_i = mileage_j
            j += 1
        if (row_written != -1) and (row_written < length - 1):
            match(row_written, length - 1, rows)
            test_over_speed(row_written, length - 1, rows)
            with open(str(file_idx) + ".csv", "wb") as output_csv:
                writer = ucsv.writer(output_csv)
                writer.writerows(rows[row_written:])
        os.chdir("../..")
        end_file = datetime.datetime.now()
        util.write_log(LOGS, "%s finish , total costs %s\n" % (init_file, str(end_file - start_file)))
Example #24
    def output(self):
        identidad = 0
        line = []
        loan = []

        for afiliado in self.afiliados:
            if afiliado.cardID is None or afiliado.cardID == '0':
                identidad += 1
                continue

            line.append(
                    (
                        self.fecha.year,
                        self.fecha.month,
                        afiliado.cardID.replace('-', ''),
                        11,
                        afiliado.get_monthly(self.fecha)
                    )
            )
            loan.append(
                    (
                        self.fecha.year,
                        self.fecha.month,
                        afiliado.cardID.replace('-', ''),
                        11,
                        afiliado.get_prestamo()
                    )
            )

        mode = 'wb'
        if self.append:
            mode = 'ab+'
        planilla = unicodecsv.writer(
                io.open(os.path.join(directory,
                                     'INPREMA{0}.csv'.format(str(self.fecha))),
                        mode),
                quoting=csv.QUOTE_ALL, encoding='utf-8')
        prestamos = unicodecsv.writer(
                io.open(os.path.join(directory,
                                     'INPREMA{0}-prestamo.csv'.format(
                                             str(self.fecha))), mode),
                quoting=csv.QUOTE_ALL)

        print("Generando Colegiación")

        [planilla.writerow(l) for l in line]

        print("Generando Prestamos")
        [prestamos.writerow(l) for l in loan]
Example #25
def parsePapersToCSV(papers_filename):
	
	global PAPERS_OUT
	global REF_OUT

	papers_file = open(papers_filename, 'r')

	dblp_papers = open(PAPERS_OUT, "w")
	dblp_ref = open(REF_OUT, "w")

	paper_wr = csv.writer(dblp_papers, quoting=csv.QUOTE_ALL)
	ref_wr = csv.writer(dblp_ref)

	INDEX = ""
	row = []
	ref_line = []
	for rline in tqdm(papers_file):
		line = rline.decode("utf-8").rstrip().replace("\n","").replace("\"", "").replace('\\', "")
		if line.startswith("#index"):
			INDEX = line.lstrip("#index").strip()
			row.append(INDEX)
		if line.startswith("#*"):
			row.append(line.lstrip("#*"))
		if line.startswith("#c"):
			row.append(line.lstrip("#c"))
		if line.startswith("#t"):
			row.append(line.lstrip("#t"))
		if line.startswith("#%"):
			ref = line.lstrip("#%").strip()
			ref_line.append(ref)
		if line.startswith("#!"):
			abstract = line.lstrip("#!")
			if (abstract == ""):
				abstract = ""
			row.append(abstract)
		if line == "":
			paper_wr.writerow(row)
			
			for r in ref_line:
				if (r != ""):
					ref_wr.writerow([INDEX, r])

			row = []
			ref_line = []

	dblp_papers.close()
	dblp_ref.close()
	papers_file.close()
Example #26
        def write_csvs(data):
            for key, value in data.items():
                with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_page.csv', 'wb') as f:
                    w = unicodecsv.writer(f)
                    w.writerow(('日期', '相对国美UV', '相对国美PV'))
                    w.writerow((str(datetime.datetime.today().date()),'%.5g' % (value['uv'] / float(data['gome']['uv'])),
                                '%.5g' % (value['pv'] / float(data['gome']['pv']))))

                    w.writerow(())
                    w.writerow(('页面', '比例'))
                    w.writerow(('商品详情页', '1'))
                    w.writerow(('收藏次数','%.5g' % (float(value['collection']) / value['pv'])))
                    w.writerow(('加入购物车成功页','%.5g' % (float(value['addcart']) / value['pv'])))
                    w.writerow(('购物车页','%.5g' % (float(value['cart']) / value['pv'])))
                    w.writerow(('订单填写页', '%.5g' % (float(value['orderwrite']) / value['pv'])))
                    w.writerow(('订单成功页','%.5g' % (float(value['ordersuccess']) / value['pv'])))

                    w.writerow(())
                    w.writerow(('会员中心页面', '相对全站UV占比', '相对全站PV占比', '会员页面访问时长'))
                    w.writerow(
                        ('-','%.5g' % (float(value['vip_uv']) / value['uv']),'%.5g' % (float(value['vip_pv']) / value['pv']), '-'))
                with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_search.csv', 'wb') as f:
                    w = unicodecsv.writer(f)
                    w.writerow(('TOP100搜索词', '占总体搜索次数百分比'))
                    w.writerows([(i,'%.5g' % (j / float(value['total_keyword_count']))) for i, j in value['top_100_keyword']])
                with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_category_and_product.csv', 'wb') as f:
                    w = unicodecsv.writer(f)
                    w.writerow(('流量TOP15品类名称', '占品类总流量的百分比'))
                    w.writerows([(i, '%.5g' % (j /  float(value['pv']))) for (i, j) in value['top_15_category']])
                with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_Beijing.csv', 'wb') as f:
                    w = unicodecsv.writer(f)
                    w.writerow(('省份/城市', '相对国美UV', '相对国美PV'))
                    w.writerow(('北京','%.5g' % (float(value['beijing_uv']) / data['gome']['beijing_uv']),
                                '%.5g' % (float(value['beijing_pv']) / data['gome']['beijing_pv'])))
                    w.writerow(('省份', '流量TOP5品类', '占该省份品类总流量的占比'))
                    w.writerows(
                        [('北京', i,'%.5g' % (float(j) / value['total_beijing_category'])) for i, j in
                         value['top_5_beijing_category']])
                    w.writerow(('省份', '流量TOP5商品', '占该省份品类总流量的占比','商品url'))
                    # print [('北京', i,'%.5g' % (float(j['count']) / value['total_beijing_product'],j['url'])) for i, j in
                    #      value['top_5_beijing_product']]
                    w.writerows(
                        [('北京', i,('%.5g' % (float(j['count']) / value['total_beijing_product'])),j['url']) for i, j in
                         value['top_5_beijing_product']])
                with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_referer.csv', 'wb') as f:
                    w = unicodecsv.writer(f)
                    w.writerow(('站外渠道名称', '流量占比'))
                    w.writerows((i,'%.5g' % j) for i,j in  value['referer_percent'])
Example #27
 def download_tweets_csv(self, request, queryset):
     f = StringIO()
     w = unicodecsv.writer(f, encoding='utf-8')
     for tweet in queryset:
         w.writerow((
             tweet['data']['id'],
             tweet['data']['text'],
             tweet['data']['timestamp'],
             tweet['data']['retweet_count'],
             tweet['data']['favorite_count'],
             tweet['data']['in_reply_to_status_id'],
             tweet['data']['in_reply_to_user_id'],
             tweet['data']['retweeted_status_id'],
             tweet['data']['coords'],
             tweet['data']['user']['screen_name'],
             tweet['data']['user']['id'],
             tweet['data']['user']['name'],
         ))
     f.seek(0)
     response = HttpResponse(
         f.read(),
         content_type='text/csv'
     )
     response['Content-Disposition'] = 'attachment;filename=export.csv'
     return response
Example #28
    def to_csv(
            self,
            hql,
            csv_filepath,
            schema='default',
            delimiter=',',
            lineterminator='\r\n',
            output_header=True):
        schema = schema or 'default'
        with self.get_conn() as conn:
            with conn.cursor() as cur:
                logging.info("Running query: " + hql)
                cur.execute(hql)
                schema = cur.description
                with open(csv_filepath, 'wb') as f:
                    writer = csv.writer(f, delimiter=delimiter,
                        lineterminator=lineterminator, encoding='utf-8')
                    if output_header:
                        writer.writerow([c[0]
                            for c in cur.description])
                    i = 0
                    while True:
                        rows = [row for row in cur.fetchmany() if row]
                        if not rows:
                            break

                        writer.writerows(rows)
                        i += len(rows)
                        logging.info("Written {0} rows so far.".format(i))
                    logging.info("Done. Loaded a total of {0} rows.".format(i))
Example #29
def main():
    jsonfile = urllib2.urlopen(BOARD_URL)
    jsons = json.loads(jsonfile.read())

    cards = jsons["cards"]
    lists = jsons["lists"]
    members = jsons["members"]

    # done or archived, actually
    donelist = filter(lambda x: x["name"] == "Done" or x["closed"] == True, lists)[0]
    donecards = filter(lambda x: x["idList"] == donelist["id"], cards)

    cardsWithMember = lambda m, x: m["id"] in x["idMembers"]
    membersCardCount = dict()
    for member in members:
        numDoneWithMember = len(filter(lambda x: cardsWithMember(member, x), donecards))
        membersCardCount[member["username"]] = numDoneWithMember

    for member in membersCardCount.keys():
        print member + "\t:", membersCardCount[member]

    idToUserName = dict()
    for member in members:
        idToUserName[member["id"]] = member["username"]

    with open("done.csv", "w") as donecsvfile:
        csvwriter = csv.writer(donecsvfile)
        csvwriter.writerows(map(lambda x: cardToRow(x, idToUserName),donecards))
Example #30
File: csv.py Project: holdenk/blaze
    def _extend(self, rows):
        rows = iter(rows)
        if sys.version_info[0] == 3:
            f = self.open(self.path, 'a', newline='')
        elif sys.version_info[0] == 2:
            f = self.open(self.path, 'ab')

        try:
            row = next(rows)
        except StopIteration:
            return
        if isinstance(row, dict):
            schema = dshape(self.schema)
            row = coerce_record_to_row(schema, row)
            rows = (coerce_record_to_row(schema, row) for row in rows)

        # Write all rows to file
        writer = csv.writer(f, **self.dialect)
        writer.writerow(row)
        writer.writerows(rows)

        try:
            f.close()
        except AttributeError:
            pass
Example #31
def retrivalByHashTag(numOfTopic, numOfCluster, clusters, hashtags):
    lda_result_dir = LDA_DIR + str(numOfTopic) + "/"
    kmean_result_dir = lda_result_dir + "KMEAN_" + str(numOfCluster) + "/"
    plot_dir = kmean_result_dir + "plot/"

    if not path.exists(plot_dir):
        makedirs(plot_dir)

    assignment_file = kmean_result_dir + "assignment_sorted.csv"

    docs = []
    with open(assignment_file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, encoding='utf-8')
        for row in spamreader:
            if int(row[0]) in clusters:
                docs.append(row[1])

    dates = []
    files = [
        f for f in listdir(LDA_DIR)
        if isfile(join(LDA_DIR, f)) and "docs" in f and ".csv" in f
    ]
    for file in files:
        with open(LDA_DIR + file, 'rb') as csvfile:
            spamreader = csv.reader(csvfile, encoding='utf-8')
            for row in spamreader:
                if row[0] in docs:
                    doc = row[3].lower()
                    tokens = tokenizer.tokenize(doc)
                    if any(i in hashtags for i in tokens):
                        postDate = row[5].split("T")[0]
                        postMonth = postDate[:-3]
                        dates.append(postMonth)

    sortedDates = sorted(dates, key=lambda d: map(int, d.split('-')))
    dateDict = Counter(sortedDates)

    startYear = int(sortedDates[0].split("-")[0])
    startMonth = sortedDates[0].split("-")[1]
    endYear = int(sortedDates[-1].split("-")[0])
    endMonth = sortedDates[-1].split("-")[1]

    print startYear, endYear
    cfile = ""
    for c in clusters:
        cfile = cfile + str(c) + "_"
    cfile = cfile[:-1]

    totalPost = len(docs)
    relevantPost = len(dates)

    plotWriter = csv.writer(open(plot_dir + 'plot_' + cfile + '.csv', 'wb'),
                            encoding='utf-8')
    plotWriter.writerow([sortedDates[0], sortedDates[-1]])
    plotWriter.writerow([totalPost, relevantPost])

    months = [
        "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"
    ]
    index = 1

    for y in range(startYear, endYear + 1):
        if y == startYear:
            for m in months[months.index(startMonth):]:
                d = str(y) + "-" + m
                if d in dateDict:
                    plotWriter.writerow([index, dateDict[d]])
                else:
                    plotWriter.writerow([index, 0])
                index = index + 1
            continue
        if y == endYear:
            for m in months[:months.index(endMonth) + 1]:
                d = str(y) + "-" + m
                if d in dateDict:
                    plotWriter.writerow([index, dateDict[d]])
                else:
                    plotWriter.writerow([index, 0])
                index = index + 1
            continue

        for m in months:
            d = str(y) + "-" + m
            if d in dateDict:
                plotWriter.writerow([index, dateDict[d]])
            else:
                plotWriter.writerow([index, 0])
            index = index + 1
Example #32
def getAnnoataion():
    annotation_file = LDA_DIR + "AnnotationTaskDrugRelated.csv"
    annotation = dict()
    with open(annotation_file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, encoding='utf-8')
        spamreader.next()
        for row in spamreader:
            annotation[row[0]] = row[1]
    annotation_counter = Counter(annotation.values())

    for t in NUM_OF_TOPIC:
        plotWriter = csv.writer(open(LDA_DIR + str(t) + 'clusterResult.csv',
                                     'wb'),
                                encoding='utf-8')
        for c in NUM_OF_CLUSTER:
            lda_result_dir = LDA_DIR + str(t) + "/"
            kmean_result_dir = lda_result_dir + "KMEAN_" + str(c) + "/"
            assignment_file = kmean_result_dir + "assignment_sorted.csv"
            clustering = dict()
            with open(assignment_file, 'rb') as csvfile:
                spamreader = csv.reader(csvfile, encoding='utf-8')
                for row in spamreader:
                    if row[0] in clustering:
                        clustering[
                            row[0]]["count"] = clustering[row[0]]["count"] + 1
                        if row[1] in annotation:
                            label = annotation[row[1]]
                            if label in clustering[row[0]]:
                                clustering[row[0]][label] = clustering[
                                    row[0]][label] + 1
                            else:
                                clustering[row[0]][label] = 1
                    else:
                        clustering[row[0]] = dict()
                        clustering[row[0]]["count"] = 1
                        if row[1] in annotation:
                            label = annotation[row[1]]
                            clustering[row[0]][label] = 1
            plotWriter.writerow(["K=" + str(c)])
            for i in range(0, c):
                h = str(i)
                print i, clustering[h]["count"]
                plotWriter.writerow(
                    ["cluster" + str(i), clustering[h]["count"]])
                plotWriter.writerow(
                    ["MedicalUse", "DrugAbuse", "Not Related", "Not Sure"])
                docCount = []
                docPrecent = []
                for j in ['1', '2', '3', '4']:
                    num = 0
                    numPrecent = 0.0
                    if j in clustering[h]:
                        num = clustering[h][j]  #/ float(annotation_counter[j])
                        numPrecent = clustering[h][j] / float(
                            annotation_counter[j])
                    docCount.append(num)
                    docPrecent.append(numPrecent)
                docCount.append(docCount[1] - docCount[2])
                docCount.append(docPrecent[1] - docPrecent[2])
                plotWriter.writerow(docCount)
            plotWriter.writerow([])
    "Danish": "Denmark",
    "Carinthian": "Trieste",
    "Schleswiger": "Kiel",
    "Polish": "Warsaw"
}

if __name__ == "__main__":
    for fname in glob.glob(foldername + "*.results"):
        gamename = basename(fname)
        gamename, _ = splitext(gamename)  # splitext is more robust
        print "Processing ", gamename
        reader = unicodecsv.reader(open(fname, "rb"),
                                   encoding="utf8",
                                   lineterminator="\n")
        gswriter = unicodecsv.writer(open(
            gamestatefolder + gamename + ".gamestate", "wb"),
                                     encoding="utf8",
                                     lineterminator="\n")

        # skip header
        reader.next()
        try:
            countries = {}
            supplycenter = {}
            r = reader.next()
            if r[1].lower().count("starting") == 0:
                continue
            lines = r[2].split("\n")
            for line in lines:
                if line.count("Army") != 0 or line.count("Fleet") != 0:
                    l = line[:-1].split()
                    country = l[0][:-1]
Example #34
def merge_unload(data,
                 delimiter,
                 file_name,
                 table,
                 stage_table,
                 upserts_dml_statements,
                 unload_query,
                 unload_bucket_name,
                 unload_key_name,
                 unload_status_query='',
                 additional_copy_options='',
                 additional_unload_options=''):
    """
	Refer this link for more details about redshift merge feature.
	https://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
	This function handles merge and unload to s3 in one single transaction.
	It creates the temp stage table identical to main table.
    Copy data from memory to s3 as a file and load the file is s3 using Redshfit copy command.
    Apply merge queries to update / insert into main table. The callee function has to pass the merge queries.
    Also callee function has to pass unload query and the query needs to be run to mark the rows being copied to s3.
	"""
    try:

        with tempfile.NamedTemporaryFile() as temp:
            fn = temp.name
            with open(fn, "w") as f:
                writer = csv.writer(f, delimiter=delimiter)
                writer.writerows(data)

            logger.info("Loading data to s3")
            session = boto3.Session(
                aws_access_key_id=os.environ['DATA_ETL_AWS_ACCESS_KEY_ID'],
                aws_secret_access_key=os.environ['DATA_ETL_AWS_SECRET_KEY_ID'])
            s3 = session.resource('s3')
            s3_bucket_load = S3_BUCKET_NAME
            s3_key_load = '{}/temp/{}.csv'.format(S3_PREFIX_KEY, file_name)
            object = s3.Object(s3_bucket_load, s3_key_load)
            object.upload_file(fn)
            upsert_statements = []
            create_stage_table_sql = "create temp table %s (like %s including defaults);" % (
                stage_table, table)

            sql_copy = '''COPY {} FROM 's3://{}/{}'  CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}'
											REGION as 'us-west-2'
											STATUPDATE ON COMPUPDATE ON delimiter '{}' {};
											'''.format(stage_table, s3_bucket_load, s3_key_load,
                      os.environ['DATA_ETL_AWS_ACCESS_KEY_ID'],
                      os.environ['DATA_ETL_AWS_SECRET_KEY_ID'], delimiter,
                      additional_copy_options)

            sql_unload = '''UNLOAD($${}$$) TO 's3://{}/{}' CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}' 
							delimiter ','  {} ;'''.format(unload_query, unload_bucket_name,
                                     unload_key_name,
                                     os.environ['DATA_ETL_AWS_ACCESS_KEY_ID'],
                                     os.environ['DATA_ETL_AWS_SECRET_KEY_ID'],
                                     additional_unload_options)

            upsert_statements.extend([
                create_stage_table_sql,
                sql_copy,
            ])
            upsert_statements.extend(upserts_dml_statements)
            upsert_statements.append(sql_unload)
            if unload_status_query is not None:
                upsert_statements.append(unload_status_query)
            execute_queries(upsert_statements)
            object.delete()
    except:
        e = sys.exc_info()
        logger.error("Error occurred in load_extract function ".format(e))
        raise
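
A hedged sketch of what a caller might pass to merge_unload; every name and SQL statement below is an illustrative assumption rather than part of the original module:

# Hypothetical call site (table names, columns and queries are made up).
rows = [(1, 'alice', '2021-01-01'), (2, 'bob', '2021-01-02')]

merge_unload(
    data=rows,
    delimiter='|',
    file_name='users_batch',
    table='public.users',
    stage_table='users_stage',
    upserts_dml_statements=[
        # classic Redshift merge: delete rows that will be replaced, then insert the staged ones
        "delete from public.users using users_stage where public.users.id = users_stage.id;",
        "insert into public.users select * from users_stage;",
    ],
    unload_query="select * from public.users where exported_at is null",
    unload_bucket_name='my-export-bucket',
    unload_key_name='exports/users_',
    unload_status_query="update public.users set exported_at = getdate() where exported_at is null;",
)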
Example #35
# -*- coding: utf-8 -*-

import os
import re
import unicodecsv as csv
from google.cloud import translate

translate_client = translate.Client()

dirs = [
    d
    for d in os.walk("/Users/levisrael/sefaria/Aspaklaria/www.aspaklaria.info")
]
fnames = [
    fname for (dirpath, dirnames, filenames) in dirs[1:]
    for fname in filenames[1:]
]
clean_names = [
    re.sub("\s+", " ",
           f.replace(".html", "").strip()) for f in fnames
]

with open("Aspaklaria-Headwords.csv", "w") as csvout:
    csvout = csv.writer(csvout)
    for tag in clean_names:
        translation = translate_client.translate(tag,
                                                 target_language='en',
                                                 source_language='iw')
        csvout.writerow([tag, translation['translatedText']])
Example #36
def generate_user_report(output_filename):
    import unicodecsv
    from django.core.files.storage import get_storage_class
    from kpi.deployment_backends.kc_access.shadow_models import (
        KobocatUser,
        KobocatUserProfile,
        KobocatXForm,
    )
    from hub.models import ExtraUserDetail

    def format_date(d):
        if hasattr(d, 'strftime'):
            return d.strftime('%F')
        else:
            return d

    def get_row_for_user(u: KobocatUser) -> list:
        row_ = []

        try:
            profile = KobocatUserProfile.objects.get(user=u)
        except KobocatUserProfile.DoesNotExist:
            profile = None

        try:
            extra_user_detail = ExtraUserDetail.objects.get(user_id=u.pk)
        except ExtraUserDetail.DoesNotExist:
            extra_details = None
        else:
            extra_details = extra_user_detail.data

        row_.append(u.username)
        row_.append(u.email)
        row_.append(u.pk)
        row_.append(u.first_name)
        row_.append(u.last_name)

        if extra_details:
            name = extra_details.get('name', '')
        else:
            name = ''
        if name:
            row_.append(name)
        elif profile:
            row_.append(profile.name)
        else:
            row_.append('')

        if extra_details:
            organization = extra_details.get('organization', '')
        else:
            organization = ''
        if organization:
            row_.append(organization)
        elif profile:
            row_.append(profile.organization)
        else:
            row_.append('')

        row_.append(KobocatXForm.objects.filter(user=u).count())

        if profile:
            row_.append(profile.num_of_submissions)
        else:
            row_.append(0)

        row_.append(format_date(u.date_joined))
        row_.append(format_date(u.last_login))

        return row_

    CHUNK_SIZE = 1000
    columns = [
        'username',
        'email',
        'pk',
        'first_name',
        'last_name',
        'name',
        'organization',
        'XForm count',
        'num_of_submissions',
        'date_joined',
        'last_login',
    ]

    default_storage = get_storage_class()()
    with default_storage.open(output_filename, 'wb') as output_file:
        writer = unicodecsv.writer(output_file)
        writer.writerow(columns)
        kc_users = KobocatUser.objects.exclude(
            pk=settings.ANONYMOUS_USER_ID
        ).order_by('pk')
        for kc_user in kc_users.iterator(CHUNK_SIZE):
            try:
                row = get_row_for_user(kc_user)
            except Exception as e:
                row = ['!FAILED!', 'User PK: {}'.format(kc_user.pk), repr(e)]
            writer.writerow(row)
Example #37
	def handle(self, *args, **options):
		#read xml from url
		file_url = 'http://webservicesri.swalekha.in/Service.asmx/GetExportSRIRegistrationData'+'?pUsername=%s&pPassword=%s' % (settings.JSLPS_USERNAME, settings.JSLPS_PASSWORD)
		url = urllib2.urlopen(file_url)
		contents = url.read()
		xml_file = open("jslps_data_integration_files/person.xml", 'w')
		xml_file.write(contents)
		xml_file.close()

		partner = Partner.objects.get(id = 24)
		user_obj = User.objects.get(username="******")
		csv_file = open('jslps_data_integration_files/person_error.csv', 'wb')
		wtr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
		tree = ET.parse('jslps_data_integration_files/person.xml')
		root = tree.getroot()

		for c in root.findall('SRIRegistrationData'):
			pc = c.find('MemID').text
			pn = unicode(c.find('MemberName').text)
			if c.find('FatherName') is not None:
				pfn = unicode(c.find('FatherName').text)
			else:
				pfn = ''
			if c.find('PhoneNo') is not None:
				phone = c.find('PhoneNo').text
			else:
				phone = ''
			if c.find('Age') is not None:
				age = int(c.find('Age').text)
			else:
				age = None
			if c.find('Gender') is not None:
				gender = c.find('Gender').text
			else:
				gender = 'F'
			vc = c.find('VillageCode').text
			gc = c.find('GroupCode').text

			try:
				village = JSLPS_Village.objects.get(village_code = vc)
			except JSLPS_Village.DoesNotExist as e:
				wtr.writerow(['JSLPS village not EXIST: '+str(pc), vc, e])
				continue
			error = 0
			try:
				group = JSLPS_Persongroup.objects.get(group_code = gc)
			except JSLPS_Persongroup.DoesNotExist as e:
				wtr.writerow(['JSLPS group not EXIST: '+str(pc), gc, e])
				error = 1

			if error == 1:
				try:
					person, created = \
						Person.objects.get_or_create(person_name = pn,
													 father_name = pfn,
													 partner=partner,
													 gender=gender,
													 village = village.Village,
													 )
					person.age = age
					person.phone_no = phone
					person.user_created_id = user_obj.id
					person.save()
					jslps.new_count += 1
				except Exception as e:
					person = None
					if "Duplicate entry" not in str(e):
						jslps.other_error_count += 1
						wtr.writerow(['Error in save person group: ',pc, e])
					else:
						jslps.duplicate_count += 1
			else:
				try:
					person, created = \
						Person.objects.get_or_create(person_name = pn,
													 father_name = pfn,
													 partner=partner,
													 gender=gender,
													 village = village.Village,
													 )
					person.age = age
					person.phone_no = phone
					person.group=group.group
					person.user_created_id = user_obj.id
					person.save()
					jslps.new_count += 1
				except Exception as e:
					person = None
					if "Duplicate entry" not in str(e):
						jslps.other_error_count += 1
						wtr.writerow(['Error in save person (group(%s)): '%(str(gc)),pc, e])
					else:
						jslps.duplicate_count += 1

			if person != None:
				jslps_person_list = JSLPS_Person.objects.filter(person_code=pc)
				if jslps_person_list.count() == 0:
					if group is not None:
						jslps_person, created = \
							JSLPS_Person.objects.get_or_create(person_code=pc,
															   person=person,
															   user_created_id=user_obj.id,
															   activity="LIVELIHOOD"
															   )
						jslps_person.group=group
						jslps_person.save()
					else:
						jslps_person, created = \
							JSLPS_Person.objects.get_or_create(person_code=pc,
															   person=person,
															   user_created_id=user_obj.id,
															   activity="LIVELIHOOD",
															   )
				else:
					jslps_person = jslps_person_list[0]
					jslps_person.person = person
					jslps_person.save()
			else:
				person_list = Person.objects.filter(person_name = pn,father_name = pfn,village = village.Village)
				if person_list.count() != 0:
					person = person_list[0]
					jslps_person_list = JSLPS_Person.objects.filter(person_code=pc,person=person)
					if jslps_person_list.count() == 0:
						if group is not None:
							jslps_person, created = \
								JSLPS_Person.objects.get_or_create(person_code=pc,
																   person=person,
																   user_created_id=user_obj.id,
																   activity="LIVELIHOOD"
																   )
							jslps_person.group=group
							jslps_person.save()
						else:
							jslps_person, created = \
								JSLPS_Person.objects.get_or_create(person_code=pc,
																   person=person,
																   user_created_id=user_obj.id,
																   activity="LIVELIHOOD",
																   )
					else:
						jslps_person = jslps_person_list[0]
						if jslps_person.person == None:
							jslps_person.person = person
							jslps_person.save()
				else:
					wtr.writerow(['Person not saved and duplicate also not exist',pc, "not saved"])

		csv_file.close()




			
Example #38
0
            col = td.text.strip()
            if (tdC == 0):
                newCol = col.split('\n')
                try:
                    if len(newCol[1]) > 0:
                        rowData.append(newCol[1])
                except:
                    continue
            else:
                col.strip('"')
                #takes quotes off wages
                if tdC < 3:
                    rowData.append(col)  #ignoring 3rd column since it's vacant
                if tdC > 3:
                    col = col[1:].replace(",", "")  #removes commas from wages
                    try:
                        col = int(col)
                        rowData.append(int(col))
                    except:
                        continue
            tdC += 1
        if len(rowData) > 0:  #only append if there's actual data
            if type(rowData[-1]
                    ) is int:  #only accepting data with player wages
                rowData.append(clubName[i])
                data.append(rowData)

with open("playerData.csv", 'a') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows(data)
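The Example #38 fragment begins mid-loop, so the enclosing iteration is not part of the excerpt. A hedged sketch of the presumed scaffolding (the URL list, clubName, and data variables are assumptions, not recovered from the original):

# Hypothetical outer loops around the per-cell parsing shown above.
import csv
import requests
from bs4 import BeautifulSoup

data = []
clubName = ['Example FC']                      # assumed: one club name per scraped page
pages = ['https://example.com/club-wages/1']   # assumed: one wage-table URL per club

for i, page in enumerate(pages):
    soup = BeautifulSoup(requests.get(page).text, 'html.parser')
    for tr in soup.find('table').find_all('tr'):
        rowData = []
        tdC = 0
        for td in tr.find_all('td'):
            pass  # per-cell handling as in the fragment above (col = td.text.strip(), ...)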
Example #39
0
            else:
                trimmed_record[column] = "Unknown"
    if "nominate" in record:
        for column in output_columns["nominate_columns"]:
            if record["nominate"] is not None and column in record["nominate"]:
                trimmed_record[column] = record["nominate"][column]
            else:
                trimmed_record[column] = "Unknown"
    for column in output_columns["record_columns"]:
        if column in record:
            trimmed_record[column] = record[column]
        else:
            trimmed_record[column] = "Unknown"
    output.append(trimmed_record)
    # except Exception as e:
    #     print e
    #     print "(moving on)"
print "Output compiled ({}/{} matched records included)".format(
    str(len(output)), str(len(matched)))
print "Writing JSON..."
with open(output_location, "w") as jsonout:
    json.dump(output, jsonout, indent=4)
if csv_location is not None:
    print "Writing CSV..."
    with open(csv_location, "w") as csvfile:
        writer = csv.writer(csvfile, encoding="utf-8")
        writer.writerow(output_columns["order"])
        for match in tqdm(matched, desc="write"):
            writer.writerow(generate_row(match))
print "Done!"
Example #40
0
    def handle(self, *args, **options):
        #read xml from url
        file_url = 'http://webservicesri.swalekha.in/Service.asmx/GetExportGroupMemberDataHnN' + '?pUsername=%s&pPassword=%s' % (
            settings.JSLPS_USERNAME, settings.JSLPS_PASSWORD)
        url = urllib2.urlopen(file_url)
        contents = url.read()
        xml_file = open("jslps_data_integration_files/jslps-hnn-person.xml",
                        'w')
        xml_file.write(contents)
        xml_file.close()

        partner = Partner.objects.get(id=24)
        user_obj = User.objects.get(username="******")
        csv_file = open('jslps_data_integration_files/jslps-hnn-person.csv',
                        'wb')
        wtr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        tree = ET.parse('jslps_data_integration_files/jslps-hnn-person.xml')
        root = tree.getroot()

        for c in root.findall('GroupMemberData'):
            district_code = c.find('DistrictCode').text
            block_code = c.find('BlockCode').text
            village_code = c.find('VillageCode').text
            group_code = c.find('GroupCode').text
            member_code = c.find('Group_M_Code').text
            member_name = c.find('MemberName').text
            father_name = c.find('FatherName').text if c.find(
                'FatherName') is not None else 'X'
            gender = 'F'

            try:
                village = JSLPS_Village.objects.get(village_code=village_code)
            except JSLPS_Village.DoesNotExist as e:
                wtr.writerow([
                    'JSLPS village not EXIST: ' + str(member_code),
                    village_code, group_code, e
                ])
                continue
            error = 0
            try:
                group = JSLPS_Persongroup.objects.get(group_code=group_code)
            except JSLPS_Persongroup.DoesNotExist as e:
                wtr.writerow([
                    'JSLPS group not EXIST: ' + str(member_code), group_code, e
                ])
                error = 1
                group = None
            # where group is not present we are trying to save person
            if error == 1:
                try:
                    person, created = \
                     Person.objects.get_or_create(person_name=member_name,
                             partner=partner,
                             gender=gender,
                             village = village.Village,
                             )
                    jslps.new_count += 1
                except Exception as e:
                    person = None
                    if "Duplicate entry" not in str(e):
                        jslps.other_error_count += 1
                        wtr.writerow(
                            ['Error in save person group: ', member_code, e])
                    else:
                        jslps.duplicate_count += 1
            # where group is present we are trying to save person
            else:
                try:
                    person, created = \
                     Person.objects.get_or_create(person_name=member_name,
                             partner=partner,
                             gender=gender,
                             village = village.Village,
                             father_name=father_name,
                             group=group.group,
                             user_created_id=user_obj.id
                             )
                    # person.group=group.group
                    # person.user_created_id = user_obj.id
                    # person.save()
                    jslps.new_count += 1
                except Exception as e:
                    # import pdb;pdb.set_trace()
                    person = None
                    if "Duplicate entry" not in str(e):
                        jslps.other_error_count += 1
                        wtr.writerow([
                            'Error in save person (group(%s)): ' %
                            (str(group_code)), member_code, e
                        ])
                    else:
                        jslps.duplicate_count += 1

            if person != None:
                jslps_person_list = JSLPS_Person.objects.filter(
                    person_code=member_code)
                if len(jslps_person_list) == 0:
                    if group is not None:
                        jslps_person, created = \
                         JSLPS_Person.objects.get_or_create(person_code=member_code,
                                    person=person,
                                    user_created_id=user_obj.id,
                                    activity="HNN"
                                    )
                        jslps_person.group = group
                        jslps_person.save()
                    else:
                        jslps_person, created = \
                         JSLPS_Person.objects.get_or_create(person_code=member_code,
                                    person=person,
                                    user_created_id=user_obj.id,
                                    activity="HNN",
                                    )
                else:
                    jslps_person = jslps_person_list[0]
                    jslps_person.person = person
                    jslps_person.save()
            else:
                person_list = Person.objects.filter(person_name=member_name,
                                                    village=village.Village)
                if len(person_list) != 0:
                    person = person_list[0]
                    jslps_person_list = JSLPS_Person.objects.filter(
                        person_code=member_code, person=person)
                    if len(jslps_person_list) == 0:
                        if group is not None:
                            jslps_person, created = \
                             JSLPS_Person.objects.get_or_create(person_code=member_code,
                                        person=person,
                                        user_created_id=user_obj.id,
                                        activity="HNN"
                                        )
                            jslps_person.group = group
                            jslps_person.save()
                        else:
                            jslps_person, created = \
                             JSLPS_Person.objects.get_or_create(person_code=member_code,
                                        person=person,
                                        user_created_id=user_obj.id,
                                        activity="HNN",
                                        )
                    else:
                        jslps_person = jslps_person_list[0]
                        if jslps_person.person == None:
                            jslps_person.person = person
                            jslps_person.save()
                else:
                    wtr.writerow([
                        'Person not saved and duplicate also not exist',
                        member_code, "not saved"
                    ])

        csv_file.close()
Example #41
0
def campaign_view_counts(campaign):
    timezone = g.user.timezone if g.user else 'UTC'
    viewdict = defaultdict(dict)

    interval = chart_interval_for(campaign)

    hourly_views = db.session.query('hour', 'count').from_statement(db.text(
        '''SELECT date_trunc(:interval, campaign_view.datetime AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour, COUNT(*) AS count FROM campaign_view WHERE campaign_id=:campaign_id GROUP BY hour ORDER BY hour;'''
    )).params(interval=interval, timezone=timezone, campaign_id=campaign.id)

    for hour, count in hourly_views:
        viewdict[hour]['_views'] = count

    hourly_views = db.session.query('hour', 'count').from_statement(db.text(
        '''SELECT date_trunc(:interval, campaign_anon_view.datetime AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour, COUNT(*) AS count FROM campaign_anon_view WHERE campaign_id=:campaign_id GROUP BY hour ORDER BY hour;'''
    )).params(interval=interval, timezone=timezone, campaign_id=campaign.id)

    for hour, count in hourly_views:
        viewdict[hour]['_views'] = viewdict[hour].setdefault('_views', 0) + count

    hourly_views = db.session.query('hour', 'count').from_statement(db.text(
        '''SELECT date_trunc(:interval, campaign_user_action.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour, COUNT(DISTINCT(user_id)) AS count FROM campaign_user_action WHERE action_id IN (SELECT id FROM campaign_action WHERE campaign_id = :campaign_id AND type != :dismiss_type) GROUP BY hour ORDER BY hour;'''
        )).params(interval=interval, timezone=timezone, campaign_id=campaign.id, dismiss_type=CAMPAIGN_ACTION.DISMISS)

    for hour, count in hourly_views:
        viewdict[hour]['_combined'] = count

    hourly_views = db.session.query('hour', 'count').from_statement(db.text(
        '''SELECT date_trunc(:interval, campaign_anon_user_action.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour, COUNT(DISTINCT(anon_user_id)) AS count FROM campaign_anon_user_action WHERE action_id IN (SELECT id FROM campaign_action WHERE campaign_id = :campaign_id AND type != :dismiss_type) GROUP BY hour ORDER BY hour;'''
        )).params(interval=interval, timezone=timezone, campaign_id=campaign.id, dismiss_type=CAMPAIGN_ACTION.DISMISS)

    for hour, count in hourly_views:
        viewdict[hour]['_combined'] = viewdict[hour].setdefault('_combined', 0) + count

    action_names = []

    for action in campaign.actions:
        action_names.append(action.name)
        hourly_views = db.session.query('hour', 'count').from_statement(db.text(
            '''SELECT date_trunc(:interval, campaign_user_action.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour, COUNT(*) AS count FROM campaign_user_action WHERE action_id=:action_id GROUP BY hour ORDER BY hour;'''
        )).params(interval=interval, timezone=timezone, action_id=action.id)
        for hour, count in hourly_views:
            viewdict[hour][action.name] = count

        hourly_views = db.session.query('hour', 'count').from_statement(db.text(
            '''SELECT date_trunc(:interval, campaign_anon_user_action.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour, COUNT(*) AS count FROM campaign_anon_user_action WHERE action_id=:action_id GROUP BY hour ORDER BY hour;'''
        )).params(interval=interval, timezone=timezone, action_id=action.id)
        for hour, count in hourly_views:
            viewdict[hour][action.name] = viewdict[hour].setdefault(action.name, 0) + count

    if viewdict:
        # Top-off with site-wide user presence (available since 31 Jan 2015 in user_active_at)
        minhour = g.user.tz.localize(min(viewdict.keys())).astimezone(UTC).replace(tzinfo=None)
        maxhour = g.user.tz.localize(max(viewdict.keys()) + timedelta(seconds=3599)).astimezone(UTC).replace(tzinfo=None)

        hourly_views = db.session.query('hour', 'count').from_statement(db.text(
            '''SELECT date_trunc(:interval, user_active_at.active_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour, COUNT(DISTINCT(user_active_at.user_id)) AS count FROM user_active_at WHERE user_active_at.active_at >= :min AND user_active_at.active_at <= :max GROUP BY hour ORDER BY hour;'''
            )).params(interval=interval, timezone=timezone, min=minhour, max=maxhour)

        for hour, count in hourly_views:
            viewdict[hour]['_site'] = count

        hourly_views = db.session.query('hour', 'count').from_statement(db.text(
            '''SELECT DATE_TRUNC(:interval, event_session.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour, COUNT(DISTINCT(anon_user_id)) AS count FROM event_session WHERE event_session.anon_user_id IS NOT NULL AND event_session.created_at >= :min AND event_session.created_at <= :max GROUP BY hour ORDER BY hour'''
            )).params(interval=interval, timezone=timezone, min=minhour, max=maxhour)

        for hour, count in hourly_views:
            viewdict[hour]['_site'] = viewdict[hour].setdefault('_site', 0) + count

    viewlist = []
    for slot in viewdict:
        row = [slot, viewdict[slot].get('_site', 0), viewdict[slot].get('_views', 0), viewdict[slot].get('_combined', 0)]
        for name in action_names:
            row.append(viewdict[slot].get(name, 0))
        viewlist.append(row)

    viewlist.sort()  # Sorts by first column, the hour slot (datetime)
    for row in viewlist:
        row[0] = row[0].isoformat()

    outfile = StringIO()
    out = unicodecsv.writer(outfile, 'excel')
    out.writerow(['_hour', '_site', '_views', '_combined'] + action_names)
    out.writerows(viewlist)

    return outfile.getvalue(), 200, {'Content-Type': 'text/plain'}
Example #42
0
    def handle(self, *args, **options):
        #saving videos
        file_url = 'http://webservicesri.swalekha.in/Service.asmx/GetExportVedioMasterData' + '?pUsername=%s&pPassword=%s' % (
            settings.JSLPS_USERNAME, settings.JSLPS_PASSWORD)
        url = urllib2.urlopen(file_url)
        contents = url.read()
        xml_file = open("jslps_data_integration_files/video.xml", 'w')
        xml_file.write(contents)
        xml_file.close()

        partner = Partner.objects.get(id=24)
        csv_file = open('jslps_data_integration_files/videos_error.csv', 'wb')
        wtr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        tree = ET.parse('jslps_data_integration_files/video.xml')
        root = tree.getroot()
        for c in root.findall('VedioMasterData'):
            vdc = c.find('VideoID').text
            vn = c.find('VideoTitle').text
            vt = int(c.find('VideoType').text)
            if c.find('Category') is not None:
                cg = int(c.find('Category').text)
            else:
                cg = None
                jslps.other_error_count += 1
                wtr.writerow([
                    'Can not save video without category', vdc, 'title', vn
                ])
                continue
            if c.find('SubCategory') is not None:
                scg = int(c.find('SubCategory').text)
            else:
                scg = None
                jslps.other_error_count += 1
                wtr.writerow([
                    'Can not save video without subcategory', vdc, 'title', vn
                ])
                continue
            if c.find('Practice') is not None:
                vp = int(c.find('Practice').text)
            else:
                vp = None
                jslps.other_error_count += 1
                wtr.writerow([
                    'Can not save video without practice', vdc, 'title', vn
                ])
                continue
            if c.find('YouTubeID') is not None:
                yid = c.find('YouTubeID').text
            else:
                yid = ''
            pd = datetime.strptime(c.find('ProductionDate').text, '%d/%m/%Y')
            if c.find('ApprovalDt') is not None:
                ad = datetime.strptime(c.find('ApprovalDt').text, '%d/%m/%Y')
            else:
                ad = None
            #sd = datetime.strptime(c.find('StartDt').text, '%d/%m/%Y')
            #ed = datetime.strptime(c.find('EndDt').text, '%d/%m/%Y')
            ln = int(c.find('Video_Language').text)
            if (ln == 2):
                ln = 18
            elif (ln == 3):
                ln = 10
            if c.find('Benefit') is not None:
                benefit = unicode(c.find('Benefit').text)
            else:
                benefit = ''
            vc = c.find('VillageCode').text
            pro_team = c.find('ProductionTeam').text.split(',')

            error = 0
            try:
                village = JSLPS_Village.objects.get(village_code=vc)
                language = Language.objects.get(id=ln)
                try:
                    facililator = JSLPS_Animator.objects.get(
                        animator_code=pro_team[0])
                except JSLPS_Animator.DoesNotExist as e:
                    facililator = JSLPS_Animator.objects.get(
                        animator_code=str(4))
                try:
                    camera_operator = JSLPS_Animator.objects.get(
                        animator_code=pro_team[1])
                except JSLPS_Animator.DoesNotExist as e:
                    camera_operator = JSLPS_Animator.objects.get(
                        animator_code=str(4))
                try:
                    category = Category.objects.get(id=cg)
                except Category.DoesNotExist as e:
                    category = None
                    jslps.other_error_count += 1
                    wtr.writerow([
                        'Can not save video without category', vdc, 'title',
                        vn, e
                    ])
                    continue
                try:
                    subcategory = SubCategory.objects.get(id=scg)
                except SubCategory.DoesNotExist as e:
                    subcategory = None
                    jslps.other_error_count += 1
                    wtr.writerow([
                        'Can not save video without subcategory', vdc, 'title',
                        vn, e
                    ])
                    continue
                try:
                    videopractice = VideoPractice.objects.get(id=vp)
                except VideoPractice.DoesNotExist as e:
                    videopractice = None
                    jslps.other_error_count += 1
                    wtr.writerow([
                        'Can not save video without practice', vdc, 'title',
                        vn, e
                    ])
                    continue

            except (JSLPS_Village.DoesNotExist, Language.DoesNotExist) as e:
                print e
                if "Duplicate entry" not in str(e):
                    jslps.other_error_count += 1
                    wtr.writerow(['village', vc, 'title', vn, e])
                error = 1

            if (error == 0):
                video_set = dict(
                    Video.objects.filter(
                        village_id=village.Village.id).values_list(
                            'title', 'village'))
                video_db = []
                video_xml = str(vn) + str(village.Village.id)
                for key, value in video_set.iteritems():
                    name = str(key) + str(value)
                    video_db.append(name)
                if video_xml not in video_db:
                    try:
                        vid = Video(title=vn,
                                    video_type=vt,
                                    language=language,
                                    benefit=benefit,
                                    production_date=pd,
                                    village_id=village.Village.id,
                                    partner=partner,
                                    approval_date=ad,
                                    youtubeid=yid,
                                    category=category,
                                    subcategory=subcategory,
                                    videopractice=videopractice)
                        vid.save()
                        jslps.new_count += 1
                        print "video saved"
                    except Exception as e:
                        print vdc, e
                        if "Duplicate entry" in str(e):
                            jslps.duplicate_count += 1
                        else:
                            jslps.other_error_count += 1
                            wtr.writerow(['video save', vdc, e])

                    try:
                        vid = Video.objects.get(title=vn,
                                                village_id=village.Village.id,
                                                partner_id=partner.id)
                        vid.production_team.add(facililator.animator)
                        vid.save()
                        jslps.new_count += 1
                        vid.production_team.add(camera_operator.animator)
                        vid.save()
                        jslps.new_count += 1
                        print "farmer shown saved"
                    except Exception as e:
                        if "Duplicate entry" in str(e):
                            jslps.duplicate_count += 1
                        else:
                            jslps.other_error_count += 1
                            wtr.writerow(['production team save', e])

                    video_added = []
                    video = None
                    try:
                        video = Video.objects.filter(
                            title=vn,
                            village_id=village.Village.id,
                            partner_id=partner.id).get()
                        video_added = JSLPS_Video.objects.values_list(
                            'vc', flat=True)
                        #video_added = [i[0] for i in video_added]
                    except Exception as e:
                        print e
                    try:
                        if vdc not in video_added:
                            vj = JSLPS_Video(vc=vdc, video=video)
                            vj.save()
                    except Exception as e:
                        print vdc, e
                        if "Duplicate entry" not in str(e):
                            jslps.other_error_count += 1
                            wtr.writerow(['JSLPS Video save', vdc, e])

        #saving non-negotiables
        file_url = 'http://webservicesri.swalekha.in/Service.asmx/GetExportVedioNon_NegotiableMasterData' + '?pUsername=%s&pPassword=%s' % (
            settings.JSLPS_USERNAME, settings.JSLPS_PASSWORD)
        url = urllib2.urlopen(file_url)
        contents = url.read()
        xml_file = open("jslps_data_integration_files/nn.xml", 'w')
        xml_file.write(contents)
        xml_file.close()

        partner = Partner.objects.get(id=24)
        tree = ET.parse('jslps_data_integration_files/nn.xml')
        root = tree.getroot()

        for c in root.findall('VedioNon_NegotiableMasterData'):
            nn_c = c.find('Non_NegotiablesID').text
            vdc = int(c.find('VideoId').text)
            nn_n = c.find('Non_NegotiablesName').text
            vr = c.find('Verification').text
            if vr == 'false':
                vr = False
            else:
                vr = True
            error = 0
            try:
                video = JSLPS_Video.objects.get(vc=vdc)
            except JSLPS_Video.DoesNotExist as e:
                error = 1
            if error == 0:
                try:
                    nonnego_already_exist = NonNegotiable.objects.filter(
                        video_id=video.video_id,
                        non_negotiable=nn_n,
                        physically_verifiable=vr)
                    if len(nonnego_already_exist) == 0:
                        nn = NonNegotiable(video_id=video.video_id,
                                           non_negotiable=nn_n,
                                           physically_verifiable=vr)
                        nn.save()
                        jslps.new_count += 1
                except Exception as e:
                    print e
                    if "Duplicate entry" in str(e):
                        jslps.duplicate_count += 1
                    else:
                        jslps.other_error_count += 1
                        wtr.writerow(['Non nego', nn_c, 'video', vdc, e])
Example #43
0
    ignored = {}
    available = {}
    for fname in os.listdir(FINAL):
        tid = fname.split("_")[1].split(".")[0]
        ignored[tid] = 0

    # Get remaining unlabeled tweets
    for fname in os.listdir(TWEETS):
        tid = fname.split("_")[1].split(".")[0]
        if tid not in ignored:
            p = os.path.join(TWEETS, fname)
            with codecs.open(p, "rb", encoding="utf-8") as f:
                tweet = json.load(f, encoding="utf-8")
            text = tweet["text"]
            # Remove all line breaks in a tweet
            text = text.replace('\n', ' ').replace('\r', '')
            available[tid] = text
            if "\t" in available[tid]:
                print "tab", tid

    # Store in csv file
    with open(DST, "wb") as f:
        writer = csv.writer(f,
                            dialect='excel',
                            encoding='utf-8',
                            delimiter="\t")
        # Write one row per tweet: <tweet-id> TAB <tweet-text>
        for tid in available:
            text = available[tid]
            writer.writerow([tid, text])
Example #44
0
def dump_csv(array, delimiter=','):
    f = io.BytesIO()
    writer = csv.writer(f, delimiter=delimiter, quoting=csv.QUOTE_ALL)
    writer.writerow(array)
    return f.getvalue()[:-2].decode('utf-8')
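A brief usage sketch for dump_csv, assuming csv here is the unicodecsv module as in the neighbouring examples: the writer terminates each record with '\r\n', which the [:-2] slice strips before decoding.

row = [u'a', u'b,c', u'3']
print dump_csv(row)   # -> "a","b,c","3"  (QUOTE_ALL quotes every field; no trailing \r\n)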
Example #45
0
def contracts_csv(request, unit_slug, semester):
    hiring_semester = get_object_or_404(HiringSemester,
                                        semester__name=semester,
                                        unit__in=request.units,
                                        unit__label=unit_slug)

    contracts = TAContract.objects.signed(hiring_semester)

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'inline; filename="%s.csv"' % (
        hiring_semester.semester.name)
    writer = csv.writer(response)
    writer.writerow([
        'Batch ID', 'Term ID', 'Contract Signed', 'Benefits Indicator',
        'EmplID', 'SIN', 'Last Name', 'First Name 1', 'First Name 2',
        'Payroll Start Date', 'Payroll End Date', 'Action', 'Action Reason',
        'Position Number', 'Job Code', 'Full_Part time', 'Pay Group',
        'Employee Class', 'Category', 'Fund', 'Dept ID (cost center)',
        'Project', 'Account', 'Prep Units', 'Base Units', 'Appt Comp Freq',
        'Semester Base Salary Rate', 'Biweekly Base Salary Pay Rate',
        'Hourly Rate', 'Standard Hours', 'Scholarship Rate Code',
        'Semester Scholarship Salary Pay Rate',
        'Biweekly Scholarship Salary Pay Rate', 'Lump Sum Amount',
        'Lump Sum Hours', 'Scholarship Lump Sum'
    ])

    seq = hiring_semester.next_export_seq()
    batchid = '%s_%s_%02i' % (hiring_semester.unit.label,
                              datetime.date.today().strftime("%Y%m%d"), seq)

    for c in contracts:
        bu = c.bu
        total_bu = c.total_bu
        prep_units = c.total_bu - c.bu

        signed = 'Y'
        benefits = 'Y'
        schol_rate = 'TSCH' if c.scholarship_per_bu > 0 else ''
        salary_total = c.total_pay
        schol_total = c.scholarship_pay
        if prep_units == 0:
            prep_units = ''

        row = []
        #Batch ID
        row.append(batchid)
        #Term ID
        row.append(hiring_semester.semester.name)
        #Signed
        row.append(signed)
        #Benefits Indicator
        row.append(benefits)
        #Emplid
        row.append(c.person.emplid)
        #SIN
        row.append(c.sin)
        #Name
        row.extend(
            [c.person.last_name, c.person.first_name, c.person.middle_name])
        #Payroll Start Date, Payroll End Date
        row.append(c.pay_start.strftime("%Y%m%d"))
        row.append(c.pay_end.strftime("%Y%m%d"))
        #Action, Action Reason
        row.append('REH')
        row.append('REH')
        #Position Number
        row.append("%08i" % c.category.account.position_number)
        #Job Code
        row.append('')
        #Full_Part time
        row.append('')
        #Pay Group
        row.append('TSU')
        #Employee Class
        row.append('')
        #Category
        row.append(c.category.code)
        #Fund
        row.append(11)
        #Dept ID(cost center)
        row.append(hiring_semester.unit.deptid())
        #Project
        row.append('')
        #Account
        row.append(c.category.account.account_number)
        #Prep Units
        row.append(prep_units)
        #Base Units
        row.append(bu)
        #Appt Comp Freq
        row.append('T')
        #Semester Base Salary Rate
        row.append("%2f" % (salary_total, ))
        #Biweekly Base Salary Rate, Hourly Rate, Standard Hours
        row.extend(['', '', ''])
        #Scholarship Rate Code
        row.append(schol_rate)
        #Semester Scholarship Salary Pay Rate
        row.append(schol_total)
        #Biweekly Scholarship Salary Pay Rate, Lump Sum Amount
        #Lump Sum Hours, Scholarship Lump Sum
        row.extend(['', '', '', ''])

        writer.writerow(row)

    return response
Example #46
0
    def _save_id_map(self):
        import unicodecsv
        writer = unicodecsv.writer(open(self.id_map_file, 'wb'))
        for key, value in self.id_map.items():
            writer.writerow([value, key])
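The id map above is persisted value-first; a hedged sketch of the matching load step (the method name and dict-rebuild logic are assumptions, not part of the original):

    def _load_id_map(self):
        import unicodecsv
        with open(self.id_map_file, 'rb') as f:
            # rows were written as [value, key], so swap them back when rebuilding the dict
            self.id_map = dict((key, value) for value, key in unicodecsv.reader(f))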
Example #47
0
def write_data(file_data, file_name):
    with open(file_name, 'a') as f:
        f_csv = unicodecsv.writer(f, encoding='utf-8-sig')
        f_csv.writerows(file_data)
    print 'output finished!'
def main():
    home_directory = os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))
    with open(
            os.path.join(home_directory,
                         'extracted_ocr\\Salem_Athenaeum_1811.html'),
            'r') as f:
        contents = f.read()
        file = html.unescape(bs(contents, 'lxml'))
        catalog = []
        final_catalog = []
        sum_of_heights = 0
        volume_sizes = [
            'folio', 'fol', 'quarto', '4to', 'octavo', 'oct', '8vo',
            'duodecimo', 'duodec', '12mo'
        ]
        for index, page in enumerate(file.find_all('page')):
            #Counting the words that go into the denominator for average word height
            counted_words = 0
            words = page.find_all('word')
            if len(words) == 0:
                continue
            words.sort(key=lambda x: float(x.get('xmin')))
            # First we sorted all of the words on the page from furthest left to furthest right.
            # Now we sort them all again by highest to lowest. Assuming one column of text per page,
            # this does an excellent job of putting all of the words in normal reading order.
            # Will need a different method for catalogs with two columns per page.
            words.sort(key=lambda x: float(x.get('ymin')))
            old_line_y = float(words[0].get('ymin'))
            line = []
            for word in words:
                # Ignoring empty "words." Not sure where they're coming from; this started happening
                # when I switched to ASCII encoding from the problematic-for-other-reasons UTF-8
                if word.text == '':
                    continue
                # Ignoring single lower case letters. These tend to be OCR artifacts that aren't useful
                if re.match("[a-z]{1}$", word.text):
                    continue
                # Ignoring et cetera
                if word.text == '&c':
                    continue
                # Ignoring things in all caps because they're typically headers
                # Even the catalogs where they're not necessarily (e.g., Ladies' Lib
                # of Kalamazoo), the OCR usually makes the all caps in running text
                # into normal text.
                if re.match("[A-Z]{2}", word.text):
                    continue
                # Needed for Lib Co Boston 1830, where a lot of 1s got turned into Is in
                # the shelf number column
                # if word.text == 'I':
                #    continue
                # Ignoring random flecks on the page that get turned into punctuation by OCR
                # Combined with number screen below.
                #if re.match("[_., %*:|'\"\^\-º“]$", word.text):
                #    continue
                # Ignoring common column headings. "Mo." is a common OCR error for an italicized "No."
                if re.match("Vol", word.text) or re.match(
                        "Wols",
                        word.text) or re.match("No.", word.text) or re.match(
                            "Mo.", word.text) or re.match("Size", word.text):
                    continue
                # "Shelf" is such a pain it gets a line of its own.
                if edit_distance("shelf",
                                 word.text.rstrip('.,?!:;').casefold()) < 3:
                    continue
                # Ignoring page numbers -- 1, 2, or 3-digit numbers not followed by "nd", "rd", "th", etc.
                # Can't do that for Charleston, because we need the numbers to calculate indents correctly.
                # Can also adjust to ignore shelf numbers when needed (e.g., NY Mechanics 1844)
                # And a tweak to ignore OCR-eaten numbers.
                # I swear that not all of this punctuation is in the ASCII code space, but I've seen
                # all of it in the OCR extracted by pdftotext with ASCII7 encoding....
                # Had to remove * from the punctuation list for NY Society 1813
                if re.match("[=%_.,:;#|*'\"\^\-º•§“{}\[\]&]{1,5}$",
                            word.text) or re.search("[0-9]{3,4}", word.text):
                    # if re.match("[0-9]{1,4}$", word.text):
                    continue
                counted_words += 1
                sum_of_heights = sum_of_heights + (float(word.get('ymax')) -
                                                   float(word.get('ymin')))
                line_y = float(word.get('ymin'))
                # We know we're on a new line of text when the ymin increases more than 7 pixels.
                # 7 pixels was selected empirically based on first several catalogs processed.
                # This may be too large of a number for very small-type catalogs.
                # Changed to 8 because found cases in Milwaukee YMA where 7 was too small.
                # Changed to 12 because of non-straight lines in Charleston Lib Co.
                if (line_y - 8) > old_line_y:
                    old_line_y = line_y
                    if line:
                        line.sort(key=lambda x: float(x.get('xmin')))
                        catalog.append(line)
                    line = [word]
                else:
                    line.append(word)
            #Append the last one on the page
            if len(line) > 0:
                line.sort(key=lambda x: float(x.get('xmin')))
                catalog.append(line)
            #Process the page, putting together split lines into single entries
            previous_line_xmin = None
            for entry in catalog:
                # Is the new line indented further than the old line? If so,
                # it needs some special handling.
                # If it's the first line on the page, the question is moot.
                if previous_line_xmin == None:
                    indent = 0
                else:
                    first_real_word = next(
                        (y for y in entry if y.text.rstrip('.,?!') != "do"),
                        None)
                    if first_real_word:
                        this_line_xmin = float(first_real_word.get('xmin'))
                        indent = previous_line_xmin - this_line_xmin
                    else:
                        continue
                # If it's a big indent, we want to carry down.
                if (indent + 25) < 0:
                    # If this line is indented and not continuing previous line,
                    # we want to append to this line everything from
                    # the previous line with an xmin smaller than the xmin of this word.
                    # Since this will carry down all relevant information from the words on the
                    # previous lines, this *should* work even in catalogs with multiple
                    # levels of indents.
                    # 'vocable' because 'word' was already taken in this script
                    # 10 pixels for slop again
                    for vocable in final_catalog[-1]:
                        if (float(vocable.get('xmin')) + 10) < this_line_xmin:
                            entry.append(vocable)
                    # sort it again because we screwed up the sort appending more words to it
                    entry = sorted(entry, key=lambda x: float(x.get('xmin')))
                # If it's a little indent, it's continuing the line above.
                elif (indent + 5) < 0:
                    # Try to reassemble hyphenated words. OCR process ate
                    # the hyphens at the end of lines, so we have to guess
                    # if two words go together or not. We'll assume that if the
                    # first word on the second line is not capitalized and is not
                    # recognized by the spellchecker then it should be concatenated
                    # with the last word on the previous line.
                    last_word_of_carryover = final_catalog[-1][-1].string
                    first_word_of_line = entry[0].string
                    if first_word_of_line and (
                            re.match("[A-Z]", first_word_of_line)
                            or re.match("[0-9 \-.,]+", first_word_of_line) or
                        (first_word_of_line.rstrip('.,?!').casefold()
                         in SpellChecker()
                         and last_word_of_carryover.rstrip('.,?!').casefold()
                         in SpellChecker())):
                        final_catalog[-1] += entry
                        final_catalog_sorted = sorted(
                            final_catalog[-1],
                            key=lambda x: float(x.get('xmin')))
                        previous_line_xmin = float(
                            final_catalog_sorted[0].get('xmin'))
                        entry = None
                    else:
                        final_catalog[-1][-1].string = final_catalog[-1][
                            -1].text + entry[0].text
                        del entry[0]
                        final_catalog[-1] += entry
                        final_catalog_sorted = sorted(
                            final_catalog[-1],
                            key=lambda x: float(x.get('xmin')))
                        previous_line_xmin = float(
                            final_catalog_sorted[0].get('xmin'))
                        entry = None
                if entry:
                    previous_line_xmin = float(entry[0].get('xmin'))
                    final_catalog.append(entry)
            catalog = []
    average_line_height = sum_of_heights / counted_words
    #average_line_height = sum_of_heights / len(file.find_all('word'))
    with open(
            os.path.join(home_directory,
                         'replication\\Salem_Athenaeum_1811.csv'),
            'wb+') as outfile:
        csvwriter = unicodecsv.writer(outfile, encoding='utf-8')
        for item in final_catalog:
            # Write catalog entries out to a CSV, omitting shelf numbers and sizes
            # Also omit headers, to the extent we can identify them by
            # having a line-height more than 20% larger than average
            # (testing this just on 1st word in line)
            if (float(item[0].get('ymax')) - float(item[0].get('ymin')) <
                    average_line_height * 1.2):
                final_entry = ''
                for vocable in item:
                    # Don't want to include shelf numbers etc. in output.
                    # Do want to include "1st", 12th", etc.
                    # Also want to get rid of "do"s now that we're done w/them
                    # Get rid of punctuation that confuses Solr (including commas and periods, which interact badly with the
                    # fuzzy search ~ when they trail a word). This includes "'s" on the end of words.
                    stripped_vocable = re.sub(
                        r'[\+ \- & \| ! , \. ( ) \{ \} \[ \] \^ " ~ \* \? : \\ #”`]',
                        '', vocable.text)
                    if stripped_vocable == '':
                        continue
                    # "o" because we need to keep volume sizes for a future step to work
                    if stripped_vocable[0].isalpha(
                    ) or stripped_vocable[-1] == 'h' or stripped_vocable[
                            -1] == 'd' or stripped_vocable[
                                -1] == 't' or stripped_vocable[-1] == 'o':
                        stripped_vocable = stripped_vocable.replace("'s", '')
                        # Don't want to attach donor info from ends of entries.
                        if stripped_vocable == 'Gift':
                            break
                        # For NY Apprentice's Lib and NY Society 1813, everything after the volume size is junk, so we're going to stop
                        # adding words to final_entry when we hit one of those.
                        if stripped_vocable == 'vol' or stripped_vocable.casefold(
                        ) in volume_sizes:
                            break
                        #if stripped_vocable != 'do' and stripped_vocable != 'ditto':
                        final_entry += ' ' + stripped_vocable
            # This is for NY Society 1813 and NY Mercantile 1825. These translation notes break matching every time.
            # Not stripping the word "translated" here (which precedes these phrases in NY Merc 1825) because it's
            # in the list of words to ignore in the Solr matching script already.
            # final_entry = final_entry.replace("from the Latin","")
            # final_entry = final_entry.replace("from the French","")
            # final_entry = final_entry.replace("from the German","")
                csvwriter.writerow([final_entry])
Example #49
0
import warc
import sys
import unicodecsv as csv

filetypes = ['.CSV', '.XLS', '.XLSX', '.JSON', '.RDF', '.ZIP']
geofiletypes = ('.GEOJSON', '.GML', '.GPX', '.GJSON', '.TIFF', '.SHP', '.KML',
                '.KMZ', '.WMS', '.WFS')
filetypes.extend(geofiletypes)

csvoutfile = open(sys.argv[1] + '.data.csv', 'a+b')
datawriter = csv.writer(csvoutfile, delimiter=',')

columns = [
    'Stadt_URL', 'URL_Datei', 'URL_Text', 'URL_Dateiname', 'Format', 'geo',
    'URL_PARENT', 'Title_Parent'
]

datawriter.writerow(columns)

f = warc.open(sys.argv[2])
domain = sys.argv[1]
blacklist = ('.jpg', '.gif', '.ico', '.txt', '.pdf', '.png', 'dns:', '.css',
             '.js')

for record in f:
    if ('WARC-Target-URI' in record.header) and (
            domain in record['WARC-Target-URI']) and not any(
                x in record['WARC-Target-URI']
                for x in blacklist) and 'metadata' in record['warc-type']:
        #for item in record.__dict__['header'].items():
        #print item
Example #50
0
def claro_sva_start():

    import MySQLdb
    import MySQLdb.cursors
    import unicodecsv as csv
    import gzip
    import urlparse
    from datetime import datetime, timedelta

    from tcm import app

    uri = urlparse.urlparse(app.config['SQLALCHEMY_DATABASE_URI'])

    conn = MySQLdb.connect(host=uri.hostname,
                           port=uri.port or 3306,
                           user=uri.username,
                           passwd=uri.password,
                           db=uri.path.lstrip('/'),
                           cursorclass=MySQLdb.cursors.SSCursor)

    dt = datetime.today()
    td = datetime.today() - timedelta(days=1)
    c = conn.cursor()
    c.execute(
        """SELECT C.created, C.id_product, CH.msisdn, C.activated, C.medium,
              CH.charged, CH.value, C.cancelled, CH.charged, CH.updated, C.msisdn,
              C.id_package, CH.code
              FROM contract_manager.charge as CH
              JOIN contract_manager.contract as C ON C.id = CH.id_contract
              WHERE C.id_product in (44, 45, 52)
              and CH.charged between '{}' and '{}';""".format(td, dt))

    date = td.strftime("%Y%m%d%H%M00")
    filename = 'SVA_10_CLARO_START_{date}.txt.gz'.format(date=date)
    with gzip.open(filename, 'w') as f:
        writer = csv.writer(f, delimiter='|')
        writer.writerow([
            "Data_Criacao_Registro_Origem", "Hora_Criacao_Registro_Origem",
            "Codigo_servico", "Codigo_Categoria", "Numero_telefone",
            "Data_Ativacao", "Hora_Ativacao", "Codigo_canal", "Comando",
            "Data_Tarifacao", "Hora_Tarifacao", "Valor_Tarifacao",
            "codigo_assinatura", "Data_Cancelamento", "Hora_Cancelamento",
            "Data_Tentativa", "Hora_Tentativa", "Codigo_Status",
            "Data_Estimada_Churn", "Hora_Estimada_Churn",
            "Valor_Repasse_Parceiro", "Valor_Repasse_Claro", "Valor_Imposto",
            "Codigo_Parceiro", "codigo_cliente", "tipo_chave",
            "Codigo_Tipo_Serviço", "Duracao_Servico", "Codigo_detalhe_produto",
            "Codigo_Sharecode"
        ])

        for i, row in enumerate(c.fetchall(), 1):
            writer.writerow([
                '%s' % row[0].strftime("%Y%m%d %H:%M:%S"), '',
                '%s' % row[1], '',
                '%s' % row[2][2:],
                '%s' % row[3].strftime("%Y%m%d %H:%M:%S"), '',
                '%s' % row[4], '',
                '%s' % row[5].strftime("%Y%m%d %H:%M:%S"), '',
                '%s' % str(round(row[6], 2)).replace('.', ','), '',
                '%s' % row[7].strftime("%Y%m%d %H:%M:%S") if row[7] else '',
                '',
                '%s' % row[8].strftime("%Y%m%d %H:%M:%S") if row[8] else '',
                '%s' % row[9].strftime("%Y%m%d %H:%M:%S"), '', '', '', '40',
                '60', '', '10',
                '%s' % row[10][2:], '1', 'e', '',
                '%s' % row[11],
                '%s' % row[12]
            ])

    f.close()
    return {'total': i, 'filename': filename}
Example #51
0
def subClustering(t, k, id, num):
    lda_result_dir = LDA_DIR + str(t) + "/"
    kmean_result_dir = lda_result_dir + "KMEAN_" + str(k) + "/"
    assignment_file = kmean_result_dir + "assignment_sorted.csv"

    kmean_result_subdir = kmean_result_dir + "sub/"
    docs_id = []

    with open(assignment_file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, encoding='utf-8')
        for row in spamreader:
            if row[0] == id:
                docs_id.append(row[1])

    docs = []
    distribution_file = lda_result_dir + 'document-topic-distribution.csv'
    with open(distribution_file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, encoding='utf-8')
        next(spamreader)
        i = 0
        for row in spamreader:
            if row[0] in docs_id:
                docs.append(row)
                i = i + 1

    print docs

    result = numpy.array(docs).astype('float')

    kmeans = KMeans(n_clusters=num,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    tol=0.0001,
                    precompute_distances='auto',
                    verbose=0,
                    random_state=None,
                    copy_x=True,
                    n_jobs=1)
    kmeans.fit(result[:, 1:])

    label = str(t) + '_' + str(k) + '_' + str(num)

    centerWriter = csv.writer(open(
        kmean_result_subdir + 'subcenters_' + label + '.csv', 'wb'),
                              encoding='utf-8')
    center_list = []
    for i, center in enumerate(kmeans.cluster_centers_):
        row = [int(i)]
        center_list.append([])
        for value in center:
            row.append(value)
        centerWriter.writerow(row)

    labels = kmeans.labels_
    assignmentWriter = csv.writer(open(
        kmean_result_subdir + 'assignment' + label + '.csv', 'wb'),
                                  encoding='utf-8')

    for i, index in enumerate(result[:, 0]):
        dist = numpy.linalg.norm(result[i, 1:] -
                                 kmeans.cluster_centers_[labels[i]])
        center_list[labels[i]].append((int(index), dist))
        assignmentWriter.writerow([int(index), labels[i], dist])

    sortedWriter = csv.writer(open(
        kmean_result_subdir + 'assignment_sorted' + label + '.csv', 'wb'),
                              encoding='utf-8')

    for i, d in enumerate(center_list):
        sorted_d = sorted(d, key=lambda x: x[1])
        for item in sorted_d:
            sortedWriter.writerow([int(i), item[0], item[1]])
Example #52
0
import unicodecsv
import requests
from BeautifulSoup import BeautifulSoup

headers = ['county', 'precinct', 'office', 'district', 'party', 'candidate', 'votes']

offices = ['United States President', 'US Senator', 'US Representative', 'State Representative',
'Governor', 'State Treasurer', 'Attorney General', 'Secretary of State', 'United States President and VP']

office_lookup = {
    'US Senator' : 'U.S. Senate', 'US Representative' : 'U.S. House', 'Governor' : 'Governor', 'State Senator' : 'State Senate',
    'State Representative' : 'State House', 'State Representative, 58th District': 'State House'
}

with open('20141104__or__general__union__precinct.csv', 'wb') as csvfile:
    w = unicodecsv.writer(csvfile, encoding='utf-8')
    w.writerow(headers)

    r = requests.get('http://union-county.org/wp-content/uploads/2013/08/EL301.htm')
    soup = BeautifulSoup(r.text)
    lines = soup.find('pre').text.split('\r\n')
    keys = []
    for line in lines:
        if line.strip() == '\n':
            continue
        if "PRECINCT REPORT       Union County, Oregon" in line:
            continue
        if line.strip() == '':
            continue
        if 'Run Date:' in line:
            continue
Example #53
0
def KMeanClustering():
    for numOfTopic in NUM_OF_TOPIC:
        lda_result_dir = LDA_DIR + str(numOfTopic) + "/"
        distribution_file = lda_result_dir + 'document-topic-distribution.csv'
        for num in NUM_OF_CLUSTER:
            kmean_result_dir = lda_result_dir + "KMEAN_" + str(num) + "/"
            if not path.exists(kmean_result_dir):
                makedirs(kmean_result_dir)

            reader = csv.reader(open(distribution_file, "rb"),
                                encoding='utf-8')
            next(reader, None)
            x = list(reader)
            result = numpy.array(x).astype('float')
            kmeans = KMeans(n_clusters=num,
                            init='k-means++',
                            n_init=10,
                            max_iter=300,
                            tol=0.0001,
                            precompute_distances='auto',
                            verbose=0,
                            random_state=None,
                            copy_x=True,
                            n_jobs=1)
            kmeans.fit(result[:, 1:])

            centerWriter = csv.writer(open(kmean_result_dir + 'centers.csv',
                                           'wb'),
                                      encoding='utf-8')

            header = []
            header.append("id")
            for i in range(numOfTopic):
                header.append("Topic " + str(i))
            centerWriter.writerow(header)

            center_list = []
            for i, center in enumerate(kmeans.cluster_centers_):
                row = [int(i)]
                center_list.append([])
                for value in center:
                    row.append(value)
                centerWriter.writerow(row)

            labels = kmeans.labels_
            assignmentWriter = csv.writer(open(
                kmean_result_dir + 'assignment.csv', 'wb'),
                                          encoding='utf-8')

            for i, index in enumerate(result[:, 0]):
                dist = numpy.linalg.norm(result[i, 1:] -
                                         kmeans.cluster_centers_[labels[i]])
                center_list[labels[i]].append((int(index), dist))
                assignmentWriter.writerow([int(index), labels[i], dist])

            sortedWriter = csv.writer(open(
                kmean_result_dir + 'assignment_sorted.csv', 'wb'),
                                      encoding='utf-8')
            docsWriter = csv.writer(open(
                kmean_result_dir + 'assignment_sorted_doc.csv', 'wb'),
                                    encoding='utf-8')

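            # For each cluster, write its members sorted by distance to the centroid; the *_doc file keeps only the 20 closest documents.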
            for i, d in enumerate(center_list):
                sorted_d = sorted(d, key=lambda x: x[1])
                count = 0
                for item in sorted_d:
                    sortedWriter.writerow([int(i), item[0], item[1]])

                    if count < 20:
                        docsWriter.writerow([int(i), item[0], item[1]])
                    count = count + 1
Example #54
0
    def do(self):
        order_round = get_current_order_round()
        print("Order round: %s" % order_round)
        if order_round.is_open:
            print("Order round is still open")
            return

        if order_round.is_not_open_yet():
            print("Order round is not open yet")
            return

        if order_round.order_placed:
            print("Order has already been placed")
            return

        # To prevent mail loops
        order_round.order_placed = True
        order_round.save()

        for supplier in Supplier.objects.all():
            if not supplier.has_orders_in_current_order_round():
                print("No orders for supplier %s" % supplier)
                log_event(event="Supplier %s has no orders in current round, "
                          "so not sending order list." % supplier)
                continue

            print("Generating lists for supplier %s" % supplier)

            # Generate CSV
            in_mem_file = io.BytesIO()
            csv_writer = csv.writer(in_mem_file, delimiter=';', quotechar='|')

            # Write header row
            csv_writer.writerow([
                "Aantal", "Eenheid", "Product", "Omschrijving",
                "Inkoopprijs Euro", "Subtotaal"
            ])

            ordered_products = supplier.products. \
                exclude(orderproducts=None). \
                filter(orderproducts__order__paid=True). \
                filter(order_round=order_round). \
                annotate(amount_sum=Sum('orderproducts__amount'))

            if not ordered_products:
                log_event(event="Supplier %s has only STOCK orders "
                          "in current round, "
                          "so not sending order list." % supplier)
                continue

            # Write products and amounts
            for obj in ordered_products:
                csv_writer.writerow([
                    fix_decimal_separator(obj.amount_sum),
                    obj.unit_of_measurement,
                    obj.name,
                    obj.description.replace("\n", " ").replace("\r", " "),
                    # TODO make sure newlines aren't stored
                    # in our products
                    fix_decimal_separator(obj.base_price),
                    fix_decimal_separator(obj.amount_sum * obj.base_price)
                ])

            # Write 'total' row
            total = fix_decimal_separator(
                sum([
                    obj.amount_sum * obj.base_price for obj in ordered_products
                ]))
            csv_writer.writerow([])
            csv_writer.writerow(["TOTAAL", "", "", "", "", total])

            # Rewind the buffer so the attachment contains the full CSV
            in_mem_file.seek(0)

            # Generate mail
            subject = ('VOKO Utrecht - Bestellijst voor %s' %
                       order_round.collect_datetime.strftime("%d %B %Y"))
            from_email = 'VOKO Utrecht Boerencontact <*****@*****.**>'
            to = '%s <%s>' % (supplier.name, supplier.email)

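            # Dutch e-mail body: "Hi <supplier>, here is our order for order round <n>,
            # for our delivery moment on <date>. The order is attached as CSV, which can
            # be opened in e.g. Excel. This is an automated message, but replies are
            # welcome. Kind regards, VOKO Utrecht"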
            text_content = """
Hoi %s,

Hierbij sturen we onze bestelling voor bestelronde %d
voor ons uitlevermoment op %s.

De bestelling is in CSV-formaat, dit is te openen in bijvoorbeeld Excel.

Dit is een geautomatiseerd bericht, maar reageren is gewoon mogelijk.

Vriendelijke groeten,
VOKO Utrecht
""" % (supplier, order_round.pk,
            order_round.collect_datetime.strftime("%d %B %Y"))

            msg = EmailMultiAlternatives(
                subject,
                text_content,
                from_email, [to],
                cc=["VOKO Utrecht Boerencontact <*****@*****.**>"])

            msg.attach('voko_utrecht_bestelling_ronde_%d.csv' % order_round.pk,
                       in_mem_file.read().decode("utf-8"), 'text/csv')
            msg.send()
Example #55
0
def LDA():
    files = [
        f for f in listdir(LDA_DIR)
        if isfile(join(LDA_DIR, f)) and "docs" in f and ".csv" in f
    ]

    texts = []
    index = []

    for file in files:
        print file
        with open(LDA_DIR + file, 'rb') as csvfile:
            spamreader = csv.reader(csvfile, encoding='utf-8')
            for row in spamreader:
                doc = row[3].lower()
                tokens = tokenizer.tokenize(doc)
                stopped_tokens = [i for i in tokens if not i in en_stop]
                stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
                if len(stemmed_tokens) > 2:
                    texts.append(stemmed_tokens)
                    index.append(row[0])

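    # Build the gensim dictionary and bag-of-words corpus from the stemmed documents.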
    dictionary = corpora.Dictionary(texts)
    dictionary.save(LDA_DIR + "dictionary.dict")  # save() requires a target path; filename assumed here
    corpus = [dictionary.doc2bow(text) for text in texts]
    print len(corpus)

    for num in NUM_OF_TOPIC:
        lda_result_dir = LDA_DIR + str(num) + "/"
        if not path.exists(lda_result_dir):
            makedirs(lda_result_dir)

        if isfile(join(lda_result_dir, "model.model")):
            ldamodel = models.ldamodel.LdaModel.load(join(
                lda_result_dir, "model.model"),
                                                     mmap='r')
        else:
            ldamodel = models.ldamodel.LdaModel(corpus,
                                                num_topics=num,
                                                id2word=dictionary,
                                                passes=20)
            ldamodel.save(lda_result_dir + "model.model")

        print "-------------", ldamodel.alpha
        textWriter = csv.writer(open(lda_result_dir + 'topic.csv', 'wb'),
                                encoding='utf-8')
        for i in range(num):
            textWriter.writerow(["Topic " + str(i)])
            for word in ldamodel.show_topic(i, 20):
                textWriter.writerow([word[0], word[1]])
            textWriter.writerow([])

        disWriter = csv.writer(open(
            lda_result_dir + 'document-topic-distribution.csv', 'wb'),
                               encoding='utf-8')
        header = []
        header.append("id")
        for i in range(num):
            header.append("Topic " + str(i))
        disWriter.writerow(header)

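        # One row per document: the document id followed by its probability for every topic.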
        for i, bow in enumerate(corpus):
            distribution = [None] * (num + 1)
            distribution[0] = index[i]
            for dis in ldamodel.get_document_topics(bow,
                                                    minimum_probability=0):
                distribution[dis[0] + 1] = dis[1]
            disWriter.writerow(distribution)
Example #56
0
import requests
import simplejson
import unicodecsv as csv

API_KEY = 'vhqTHd5SvnkydORd2kwPa1CPUgN8bzHwNvTQu7KUdUgxGh88eF'
API_URL = 'http://api.tumblr.com/v2/blog/%s/posts?api_key=%s&offset=%d'
TUMBLR_NAME = 'wearethe99percent.tumblr.com'

r = requests.get(API_URL % (TUMBLR_NAME, API_KEY, 0))

json = simplejson.loads(r.content)

total_posts = json['response']['total_posts']

offset = 0

csv_file = open('tumblr.csv', 'wb')  # unicodecsv expects a binary-mode file on Python 2
writer = csv.writer(csv_file)

writer.writerow(('id', 'post_url', 'timestamp', 'datetime', 'tags', 'body'))

while offset < total_posts:
    print 'Fetching with offset %d' % offset
    r = requests.get(API_URL % (TUMBLR_NAME, API_KEY, offset))
    offset += 20
    json = simplejson.loads(r.content)
    posts = json['response']['posts']
    for post in posts:
        if post['type'] == 'text':
            body = post['body']
        elif post['type'] == 'photo':
            body = post['caption']
        else:
Example #57
0
    def write_to_csv(self, item):
        csv_write = unicodecsv.writer(self.csvfile,
                                      encoding='utf-8-sig',
                                      dialect='excel')
        csv_write.writerow(item)
from kafka import KafkaConsumer
import json
import unicodecsv as csv
from ast import literal_eval
import os.path
from os import path
# Connect to a particular topic and decode the messages

consumer = KafkaConsumer('Histcounts_Month',
                         bootstrap_servers=['0.0.0.0:9092'],
                         value_deserializer=lambda v: json.loads(v.decode('utf-8')))
if not path.exists("histclientcounts_month.csv"):
    f = open("histclientcounts_month.csv", "a")
    fWriter = csv.writer(f)
    # write the header of the csv
    fWriter.writerow(['DisplayName', 'id', 'CollectionTime', 'Subkey', 'type', 'MAC'])
    f.close()

# flag for spacing
a = 1
flag = False
f = open("histclientcounts_month.csv","a")
f.write('\n')
f.close()
# loop for messages
for msg in consumer:
    f = open("histclientcounts_month.csv","a")
    # new line after 6 entries
    if (a - 1) % 6 == 0 and flag:
        f.write('\n')
Example #59
0
    def execute(self, context):
        vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)
        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

        tmpfile = None
        result = None

        selected_columns = []

        count = 0
        with closing(vertica.get_conn()) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(self.sql)
                selected_columns = [d.name for d in cursor.description]

                if self.bulk_load:
                    tmpfile = NamedTemporaryFile("w")

                    self.log.info(
                        "Selecting rows from Vertica to local file %s...",
                        tmpfile.name)
                    self.log.info(self.sql)

                    csv_writer = csv.writer(tmpfile,
                                            delimiter='\t',
                                            encoding='utf-8')
                    for row in cursor.iterate():
                        csv_writer.writerow(row)
                        count += 1

                    tmpfile.flush()
                else:
                    self.log.info("Selecting rows from Vertica...")
                    self.log.info(self.sql)

                    result = cursor.fetchall()
                    count = len(result)

                self.log.info("Selected rows from Vertica %s", count)

        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator...")
            mysql.run(self.mysql_preoperator)

        try:
            if self.bulk_load:
                self.log.info("Bulk inserting rows into MySQL...")
                with closing(mysql.get_conn()) as conn:
                    with closing(conn.cursor()) as cursor:
                        cursor.execute(
                            "LOAD DATA LOCAL INFILE '%s' INTO "
                            "TABLE %s LINES TERMINATED BY '\r\n' (%s)" %
                            (tmpfile.name, self.mysql_table,
                             ", ".join(selected_columns)))
                        conn.commit()
                tmpfile.close()
            else:
                self.log.info("Inserting rows into MySQL...")
                mysql.insert_rows(table=self.mysql_table,
                                  rows=result,
                                  target_fields=selected_columns)
            self.log.info("Inserted rows into MySQL %s", count)
        except (MySQLdb.Error, MySQLdb.Warning):
            self.log.info("Inserted rows into MySQL 0")
            raise

        if self.mysql_postoperator:
            self.log.info("Running MySQL postoperator...")
            mysql.run(self.mysql_postoperator)

        self.log.info("Done")
Example #60
0
@retry(wait_random_min=5000, wait_random_max=10000)
def parse_page(url):
    print url
    token = str(random.randint(100000, 999999))
    cat_page = html.parse(base + url)
    pages = cat_page.xpath('//select[@name="page"]/option')
    pages = int(pages[-1].attrib["value"]) if pages else 0
    for thread in cat_page.xpath('//ul[@class="thread-list"]/li//h3/a'):
        scrape_thread(thread)
    for page in xrange(1, pages):
        cat_page = html.parse(base + url + "?p_token=" + token + "&page=" +
                              str(page))
        for thread in cat_page.xpath('//ul[@class="thread-list"]/li//h3/a'):
            scrape_thread(thread)


base = "http://patient.info"
start = html.parse("http://patient.info/forums")
f = open('patient.csv', 'w')
w = unicodecsv.writer(f, encoding='utf-8', lineterminator='\n')
w.writerow([
    'uniqueID', 'qid', 'localID', 'title', 'poster', 'date', 'replyTo',
    'content', 'infered_replies', 'category'
])

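# Walk the alphabetical forum index pages and scrape every category listed on them.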
for letter in string.lowercase:
    for cat in html.parse(base + "/forums/index-" +
                          letter).xpath('//table[@class="zebra-table"]//a'):
        subforum = cat.text
        parse_page(cat.attrib["href"])