def getReader(uploadfile):
	file = uploadfile.file
	extension = os.path.splitext(uploadfile.filename)[1]

	csvreader = None
	# make sure to convert excel files
	if extension == '.xls':
		file = StringIO.StringIO(xls2csv(file))
		csvreader = reader(file)
	else:
		dialect = sniffer.sniff_dialect(file.read(4096))
		file.seek(0)
		csvreader = reader(file, dialect=dialect)
	return csvreader
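A minimal usage sketch for getReader, assuming the csvkit imports used above are in scope; FakeUpload is a hypothetical stand-in for a web-framework upload object exposing the file and filename attributes the function expects:

import StringIO  # Python 2, matching the snippet above

class FakeUpload(object):
	# hypothetical upload object with the .file / .filename attributes getReader reads
	def __init__(self, filename, data):
		self.filename = filename
		self.file = StringIO.StringIO(data)

upload = FakeUpload('data.csv', 'a,b,c\n1,2,3\n')
for row in getReader(upload):
	print(row)  # ['a', 'b', 'c'], then ['1', '2', '3']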
    def build_post_level_report(self, f):
        PERMALINK = 1
        MESSAGE = 2
        POST_DATE = 6
        TOTAL_REACH = 7
        IMPRESSIONS = 10
        post_data = list()

        csv = csvkit.reader(f)
        next(csv)  # skip first 2 lines
        next(csv)
        for idx, r in enumerate(csv):
            post_data.append(
                {
                    "permalink": r[PERMALINK],
                    "pubdate": datetime.datetime.strptime(r[POST_DATE], "%m/%d/%Y %H:%M:%S %p"),
                    "message": r[MESSAGE],
                    "totalreach": safe_cast(r[TOTAL_REACH], int, 0),
                    "impressions": safe_cast(r[IMPRESSIONS], int, 0),
                }
            )

        start_date = min(post_data, key=lambda i: i["pubdate"])["pubdate"]
        end_date = max(post_data, key=lambda i: i["pubdate"])["pubdate"]

        sorted_data = sorted(post_data, key=lambda i: i["totalreach"], reverse=True)
        top_posts = sorted_data[:10]
        sorted_data.reverse()
        bottom_posts = sorted_data[:10]

        return {"start_date": start_date, "end_date": end_date, "top_posts": top_posts, "bottom_posts": bottom_posts}
Example n. 3
def importaCandidatos(arquivo):
	print "Importando candidatos"
	raw = open(arquivo, 'r')
	raw = csvkit.reader(raw, encoding='iso-8859-1', delimiter=';')
	candidatos = {}
	lista = {}
	for c in raw:
		c[10] = unidecode.unidecode(c[10])
		if c[9] in ['GOVERNADOR', 'PRESIDENTE', 'DEPUTADO FEDERAL', 'SENADOR']:
			candidatos[c[26]] = {
				'nome' : c[10],
				'apelidos' : [c[13]],
				'_id' : c[26],
				'candidaturas' : {},
				'mugshot' : c[11]
			}
			candidatos[c[26]]['candidaturas']['2014'] = {
				'cargo' : c[9],
				'situacao' : c[15],
				'numero' : c[12],
				'partido' : c[17],
				'uf' : c[5],
				'doacoes' : {},
				'total' : 0,
			}

		# Save the name list
		if c[9] not in ['REMOVER']:
			lista[c[10]] = 0
		if c[9] in ['GOVERNADOR', 'PRESIDENTE']: # also add the ballot-box name in these cases
			lista[c[13]] = c[10]
	mongo_save(candidatos, 'politicos', True)
	with open('names.js', 'w') as final:
		header ="var nick = "
		final.write(header+json.dumps(lista))
Example n. 4
    def from_csv(cls, path, column_info, header=True, **kwargs):
        """
        Create a new table for a CSV. This method will use csvkit if it is
        available, otherwise it will use Python's builtin csv module.

        ``kwargs`` will be passed through to :meth:`csv.reader`.

        If you are using Python 2 and not using csvkit, this method is not
        unicode-safe.

        :param path: Path to the CSV file to read from.
        :param column_info: See :class:`.Table` constructor.
        :param header: If `True`, the first row of the CSV is assumed to contain
            headers and will be skipped.
        """
        with open(path) as f:
            rows = list(csv.reader(f, **kwargs))

        if header:
            column_names = rows.pop(0)
        else:
            column_names = [None] * len(rows[0])

        if len(column_names) != len(column_info):
            # TKTK Better Error
            raise ValueError('CSV contains a different number of columns than were specified.')

        return Table(rows, column_info)
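The docstring's csvkit-or-builtin behavior is usually implemented with a guarded import; a minimal sketch of that pattern (an assumption about this codebase, not code taken from it):

try:
    from csvkit import reader  # unicode-aware CSV reading on Python 2
except ImportError:
    from csv import reader  # fall back to the builtin module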
Example n. 5
    def from_csv(cls, path, column_info=None, row_names=None, header=True, **kwargs):
        """
        Create a new table for a CSV. This method will use csvkit if it is
        available, otherwise it will use Python's builtin csv module.

        ``kwargs`` will be passed through to :meth:`csv.reader`.

        If you are using Python 2 and not using csvkit, this method is not
        unicode-safe.

        :param path:
            Filepath or file-like object from which to read CSV data.
        :param column_info:
            May be any valid input to :meth:`Table.__init__` or
            an instance of :class:`.TypeTester`. Or, None, in which case a
            generic :class:`.TypeTester` will be created.
        :param row_names:
            See :meth:`Table.__init__`.
        :param header:
            If `True`, the first row of the CSV is assumed to contain
            headers and will be skipped.
        """
        if column_info is None:
            column_info = TypeTester()

        use_inference = isinstance(column_info, TypeTester)

        if hasattr(path, 'read'):
            rows = list(csv.reader(path, **kwargs))
        else:
            with open(path) as f:
                rows = list(csv.reader(f, **kwargs))

        if header:
            column_names = rows.pop(0)
        else:
            column_names = [None] * len(rows[0])

        if use_inference:
            column_info = column_info.run(rows, column_names)
        else:
            if len(column_names) != len(column_info):
                # TKTK Better Error
                raise ValueError('CSV contains a different number of columns than were specified.')

        return Table(rows, column_info, row_names=row_names)
def analyze_fb_page_data(page_file):
    print("Analying Facebook Page file {0}".format(page_file))
    DATE = 0
    TOTAL_LIKES = 1
    ENGAGED_USERS = 6
    TOTAL_REACH = 26
    IMPRESSIONS = 35

    data = list()
    with open(page_file) as f:
        csv = csvkit.reader(f)
        # skip first 2 lines
        next(csv)
        next(csv)
        for idx, r in enumerate(csv):
            data.append({
                'date': datetime.datetime.strptime(r[DATE], "%Y-%m-%d"),
                'likes': safe_cast(r[TOTAL_LIKES], int, 0),
                'engaged_users': safe_cast(r[ENGAGED_USERS], int, 0),
                'reach': safe_cast(r[TOTAL_REACH], int, 0),
                'impressions': safe_cast(r[IMPRESSIONS], int, 0)
            })

    start_date = min(data, key=lambda i: i['date'])['date']
    end_date = max(data, key=lambda i: i['date'])['date']

    print "Start Date: ", start_date
    print "End Date: ", end_date

    chart_labels = [datetime.datetime.strftime(x['date'], "%m/%d/%Y") for x in data]
    reach = [x['reach'] for x in data]
    impressions = [x['impressions'] for x in data]
    engaged = [x['engaged_users'] for x in data]
    likes = [x['likes'] for x in data]

    reach.insert(0, 'Reach')
    impressions.insert(0, 'Impressions')
    engaged.insert(0, 'Engaged Users')
    likes.insert(0, 'Page Likes')

    template = render_template(FB_PAGE_TMPL,
                               {'start_date': start_date,
                                'end_date': end_date,
                                'chart_labels': chart_labels,
                                'reach': reach,
                                'impressions': impressions,
                                'engaged': engaged,
                                'likes': likes})

    template = template.encode('utf-8')

    print("Generating FB Page Report {0}".format(FB_PAGE_REPORT_OUT_NAME))

    with open(FB_PAGE_REPORT_OUT_NAME, "wb") as outf:
        outf.write(template)
def processFile(column, filename):
    dataValues = [] # the data
    with open(filename, 'r') as csvData:
        csvReader = csvkit.reader(csvData, delimiter=',', quotechar='"', skipinitialspace=True)
        for row in csvReader:
            dataValues.append(int(row[column]))
    dataSeries = pd.Series(dataValues)
    dataDescription = dataSeries.describe()
    descriptionStr = [str(e) for e in dataDescription.tolist()]
    descriptionStr.insert(0, filename)
    print ", ".join(descriptionStr)
Example n. 8
    def from_csv(cls, path, column_info, header=True, **kwargs):
        """
        Create a new table for a CSV. This method will use csvkit if it is
        available, otherwise it will use Python's builtin csv module.

        ``kwargs`` will be passed through to :meth:`csv.reader`.

        If you are using Python 2 and not using csvkit, this method is not
        unicode-safe.

        :param path: Filepath or file-like object from which to read CSV data.
        :param column_info: A sequence of pairs of column names and types. The latter
            must be instances of :class:`.DataType`. Or, an instance of
            :class:`.TypeTester` to infer types.
        :param header: If `True`, the first row of the CSV is assumed to contain
            headers and will be skipped.
        """
        use_inference = isinstance(column_info, TypeTester)

        if use_inference and not header:
            raise ValueError("Can not apply TypeTester to a CSV without headers.")

        if hasattr(path, "read"):
            rows = list(csv.reader(path, **kwargs))
        else:
            with open(path) as f:
                rows = list(csv.reader(f, **kwargs))

        if header:
            column_names = rows.pop(0)
        else:
            column_names = [None] * len(rows[0])

        if use_inference:
            column_info = column_info.run(rows, column_names)
        else:
            if len(column_names) != len(column_info):
                # TKTK Better Error
                raise ValueError("CSV contains a different number of columns than were specified.")

        return Table(rows, column_info)
Example n. 9
    def test_writer_alias(self):
        output = six.StringIO()
        writer = csvkit.writer(output)
        writer.writerow(['a', 'b', 'c'])
        writer.writerow(['1', '2', '3'])
        writer.writerow(['4', '5', u'ʤ'])

        written = six.StringIO(output.getvalue())

        reader = csvkit.reader(written)
        self.assertEqual(next(reader), ['a', 'b', 'c'])
        self.assertEqual(next(reader), ['1', '2', '3'])
        self.assertEqual(next(reader), ['4', '5', u'ʤ'])
Example n. 10
    def test_writer_alias(self):
        output = six.StringIO()
        writer = csvkit.writer(output, encoding='utf-8')
        self.assertEqual(writer._eight_bit, True)
        writer.writerow(['a', 'b', 'c'])
        writer.writerow(['1', '2', '3'])
        writer.writerow(['4', '5', u'ʤ'])

        written = six.StringIO(output.getvalue())

        reader = csvkit.reader(written, encoding='utf-8')
        self.assertEqual(next(reader), ['a', 'b', 'c'])
        self.assertEqual(next(reader), ['1', '2', '3'])
        self.assertEqual(next(reader), ['4', '5', u'ʤ'])
Example n. 11
def processFile(column, filename):
    dataValues = []  # the data
    with open(filename, 'r') as csvData:
        csvReader = csvkit.reader(csvData,
                                  delimiter=',',
                                  quotechar='"',
                                  skipinitialspace=True)
        for row in csvReader:
            dataValues.append(int(row[column]))
    dataSeries = pd.Series(dataValues)
    dataDescription = dataSeries.describe()
    descriptionStr = [str(e) for e in dataDescription.tolist()]
    descriptionStr.insert(0, filename)
    print ", ".join(descriptionStr)
Example n. 12
    def test_writer_alias(self):
        output = six.StringIO()
        writer = csvkit.writer(output, encoding='utf-8')
        self.assertEqual(writer._eight_bit, True)
        writer.writerow(['a', 'b', 'c'])
        writer.writerow(['1', '2', '3'])
        writer.writerow(['4', '5', u'ʤ'])

        written = six.StringIO(output.getvalue())

        reader = csvkit.reader(written, encoding='utf-8')
        self.assertEqual(next(reader), ['a', 'b', 'c'])
        self.assertEqual(next(reader), ['1', '2', '3'])
        self.assertEqual(next(reader), ['4', '5', u'ʤ'])
def loadMasterCSVFile():
    """
    loads the durations of all eegs from an NK database derived csv file of eeg numbers with their associated start 
    and end times. 
    :return: 
    """
    with codecs.open(nkDurationFilename,
                     'r',
                     encoding='utf-8',
                     errors='ignore') as cf:
        masterCSV = csv.reader(cf)
        masterCSV_list = [row for row in masterCSV]
        print(len(masterCSV_list))
        return masterCSV_list
def tsvTOcsv(input_file_name, out_file):
    """
    converts tsv formatted files to csv,
    used to make the csv file that is readable by this EEG report feature analyzer
    :param input_file_name: a tsv file
    :param out_file: a csv file
    :return: 
    """
    i = 0
    with open(input_file_name, 'rb') as tsvin, open(out_file, 'wb') as csvout:
        tsvin = csv.reader(tsvin, delimiter='\t')
        csvout = csv.writer(csvout)

        for row in tsvin:
            if len(row) > 0:
                csvout.writerow(row)
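A one-call usage sketch; the file names are illustrative:

tsvTOcsv('eeg_reports.tsv', 'eeg_reports.csv')  # rewrite the TSV as CSV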
Example n. 15
def extract_data():  ## Extracts from txt or csv; returns "exp_V" and "exp_I" lists
    exp_V = []
    exp_I = []
    line_num = 0
    run_path = os.getcwd()
    namefile = input("Enter the name of the file: ")
    typefile = input("Which type of data? [txt/csv]")
    header = int(input("How many lines need to be removed from the top? "))
    # namefile = "test"                                     #This block can be used to speed up during testing
    # typefile = "csv"
    # header = 0
    os.chdir(data_path)
    if typefile == "csv":
        with open(namefile + ".csv") as file:
            rowreader = csvkit.reader(file, delimiter=";")
            for row in rowreader:
                line_num += 1
                try:  # verify whether this can act as a text-removal step
                    exp_V.append(float(row[0]))
                    exp_I.append(float(row[1]))
                except (ValueError, IndexError):
                    print("Line " + str(line_num) + " contains NaNs")
            for i in range(header):
                exp_V.pop(0)
                exp_I.pop(0)
    elif typefile == "txt":
        my_file = open(namefile + ".txt")
        data = my_file.read()
        my_file.close()
        lines = data.split("\n")
        for line in lines:
            row = line.split("\t")
            line_num += 1
            try:
                exp_V.append(float(row[0]))
                exp_I.append(float(row[1]))
            except (ValueError, IndexError):
                print("Line " + str(line_num) + " contains NaNs")

    # voltage = np.array([volt])
    # current = np.array([curr])        # Might be necessary to perform a fitting. To be investigated.

    os.chdir(run_path)
    return exp_V, exp_I
    def build_page_level_report(self, f):
        DATE = 0
        TOTAL_LIKES = 1
        ENGAGED_USERS = 6
        TOTAL_REACH = 26
        IMPRESSIONS = 35

        data = list()

        csv = csvkit.reader(f)
        next(csv)  # skip first 2 lines
        next(csv)
        for idx, r in enumerate(csv):
            data.append(
                {
                    "date": datetime.datetime.strptime(r[DATE], "%Y-%m-%d"),
                    "likes": safe_cast(r[TOTAL_LIKES], int, 0),
                    "engaged_users": safe_cast(r[ENGAGED_USERS], int, 0),
                    "reach": safe_cast(r[TOTAL_REACH], int, 0),
                    "impressions": safe_cast(r[IMPRESSIONS], int, 0),
                }
            )

        start_date = min(data, key=lambda i: i["date"])["date"]
        end_date = max(data, key=lambda i: i["date"])["date"]

        chart_labels = [datetime.datetime.strftime(x["date"], "%m/%d/%Y") for x in data]
        reach = [x["reach"] for x in data]
        impressions = [x["impressions"] for x in data]
        engaged = [x["engaged_users"] for x in data]
        likes = [x["likes"] for x in data]

        reach.insert(0, "Reach")
        impressions.insert(0, "Impressions")
        engaged.insert(0, "Engaged Users")
        likes.insert(0, "Page Likes")

        return {
            "start_date": start_date,
            "end_date": end_date,
            "chart_labels": chart_labels,
            "reach": reach,
            "impressions": impressions,
            "engaged_users": engaged,
            "likes": likes,
        }
Example n. 17
def generateTSECand():
	'''Generates names.json from the 2014 candidacy files in this folder.
	   http://agencia.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_2014.zip
	'''
	lista = {}
	ufs = ["AC", "AL", "AM", "AP",  "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS", "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC", "SE", "SP", "TO","BR"]
	for uf in ufs:
		print 'Getting '+uf
		cand = open("../raw/candidaturas2014/consulta_cand_2014_"+uf+".txt", 'r')
		cand = csvkit.reader(cand, encoding='iso-8859-1', delimiter=';')
		for c in cand:
			#if c[15] == 'DEFERIDO': # many candidacies have not yet been approved
			#if c[9] not in ['REMOVER']:
			#	lista[unidecode.unidecode(c[10])] = 0
			if c[9] in ['GOVERNADOR', 'PRESIDENTE']: # also add the ballot-box name in these cases
				lista[unidecode.unidecode(c[10])] = 0
				lista[c[13]] = unidecode.unidecode(c[10])
	return lista
    def handle(self, *args, **options):
        # read in CSV
        print("This is an auto-generated Django model module \
            created by apps.core.commands.")
        print("from django.contrib.gis.db import models\n")

        with open(args[0], 'rb') as csvfile:

            reader = csvkit.reader(csvfile)
            headers = next(reader)
            print("class GeneratedModel(models.Model):")

            for row in headers:
                # take the row, slugify it
                # and replace the hyphens with underscores
                field = slugify(row).replace('-', '_')
                print("    %s = models.CharField(max_length=255)" % field)

            print("\n")
def analyze_fb_post_data(post_file):
    PERMALINK = 1
    MESSAGE = 2
    POST_DATE = 6
    TOTAL_REACH = 7
    IMPRESSIONS = 10

    print("Analying Facebook Post file {0}".format(post_file))
    post_data = list()

    with open(post_file) as f:
        csv = csvkit.reader(f)
        # skip first 2 lines
        next(csv)
        next(csv)
        for idx, r in enumerate(csv):
            post_data.append({
                'permalink': r[PERMALINK],
                'pubdate': datetime.datetime.strptime(r[POST_DATE], "%m/%d/%Y %H:%M:%S %p"),
                'message': r[MESSAGE],
                'totalreach': int(r[TOTAL_REACH]),
                'impressions': int(r[IMPRESSIONS])})

    start_date = min(post_data, key=lambda i: i['pubdate'])['pubdate']
    end_date = max(post_data, key=lambda i: i['pubdate'])['pubdate']

    sorted_data = sorted(post_data, key=lambda i: i['totalreach'], reverse=True)

    template = render_template(FB_POST_TMPL,
                               {'top_posts': sorted_data[:10],
                                'start_date': start_date,
                                'end_date': end_date})

    template = template.encode('utf-8')

    print("Generating FB Post Report {0}".format(FB_POST_REPORT_OUT_NAME))

    with open(FB_POST_REPORT_OUT_NAME, "wb") as outf:
        outf.write(template)

    return 0
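render_template and the FB_* constants are defined elsewhere in this script; a minimal Jinja2-based sketch of what the helper likely does (an assumption, not the original implementation):

import jinja2

def render_template(template_path, context):
    # read the template file and render it with the supplied context dict
    with open(template_path) as f:
        return jinja2.Template(f.read()).render(**context)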
Example n. 20
def generateCand():
	'''Generates names.json from the 2014 candidacy files in this folder.
	   http://agencia.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_2014.zip
	'''
	lista = {}
	ufs = ["AC", "AL", "AM", "AP",  "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS", "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC", "SE", "SP", "TO","BR"]
	for uf in ufs:
		print 'Getting '+uf
		cand = open("../raw/consulta_cand_2014_"+uf+".txt", 'r')
		cand = csvkit.reader(cand, encoding='iso-8859-1', delimiter=';')
		for c in cand:
			#if c[15] == 'DEFERIDO': # many candidacies have not yet been approved
			if c[9] not in ['DEPUTADO ESTADUAL']:
				lista[c[10]] = 0
			if c[9] in ['GOVERNADOR', 'PRESIDENTE']: # also add the ballot-box name in these cases
				lista[c[13]] = c[10]

	with open('names.js', 'w') as final:
		header ="var nick = "
		final.write(header+json.dumps(lista))
Example n. 21
    def parse_downloaded_file(self, file_path):
        print "Parse %s" % file_path
        with open(file_path,'rb') as f:
            content = f.read().replace("\r\n","\n").replace("\xef\xbb\xbf","")
            all_rows = list(csv.reader(StringIO(content), delimiter=";"))
            if len(all_rows) == 0:
                print "%s is empty." % file_path
                raise EmptyFileError("In file {}".format(file_path))

            title_row = all_rows[2]
            title = title_row[1]


            param_rows = [x for x in all_rows[-6:] if len(x) > 1]
            param_headers = [x[0].replace(":","").strip() for x in param_rows]
            param_values = [x[1].strip() for x in param_rows]

            data_rows = all_rows[3:-7]
            data_headers = [x.replace("\n","") for x in data_rows[0]]
            data_headers[0] = "Region"
            headers = param_headers + data_headers
            for row in data_rows[1:]:
                values = param_values + row
                datapoint = dict(zip(headers, values))
                # AMS changed their format slightly in 2017-05
                # This is a hack to get the old "Utrikesfödda"
                # in the same way as before
                # Current format seems faulty.
                if u"Utrikesfödda" not in datapoint.keys():
                    if u"utrikesfödda" in title:
                        datapoint[u"Utrikesfödda"] = "Ja"
                    else:
                        datapoint[u"Utrikesfödda"] = ""

                self.append(datapoint)


            return self
Example n. 22
    def from_csv(cls, path, column_info, header=True, **kwargs):
        """
        Create a new table for a CSV. Will use csvkit if it is available,
        otherwise will use Python's builtin csv module. ``kwargs`` will be
        passed through to :meth:`csv.reader`.

        Note: if using Python 2 and not using csvkit, this method is not
        unicode-safe.

        :param path: Path to the CSV file to read from.
        :param column_info: See :class:`.Table` constructor.
        """
        with open(path) as f:
            rows = list(csv.reader(f, **kwargs))

        if header:
            column_names = rows.pop(0)
        else:
            column_names = [None] * len(rows[0])

        if len(column_names) != len(column_info):
            # TKTK Better Error
            raise ValueError('CSV contains a different number of columns than were specified.')

        return Table(rows, column_info)
Example n. 23
def generateTSECand():
    '''Generates names.json from the 2014 candidacy files in this folder.
       http://agencia.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_2014.zip
    '''
    lista = {}
    ufs = [
        "AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS",
        "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC",
        "SE", "SP", "TO", "BR"
    ]
    for uf in ufs:
        print 'Getting ' + uf
        cand = open(
            "../raw/candidaturas2014/consulta_cand_2014_" + uf + ".txt", 'r')
        cand = csvkit.reader(cand, encoding='iso-8859-1', delimiter=';')
        for c in cand:
            #if c[15] == 'DEFERIDO': # many candidacies have not yet been approved
            #if c[9] not in ['REMOVER']:
            #	lista[unidecode.unidecode(c[10])] = 0
            if c[9] in ['GOVERNADOR', 'PRESIDENTE']:  # also add the ballot-box name in these cases
                lista[unidecode.unidecode(c[10])] = 0
                lista[c[13]] = unidecode.unidecode(c[10])
    return lista
Example n. 24
    return [item for sublist in l for item in sublist]


def parseTournament(tournamentId, playerList):
    final_data = [
        parsePlayer(tournamentId, playerId) for playerId in playerList
    ]
    return [item for sublist in final_data for item in sublist]


#finalData = parseTournament(309, [66,686])


# load in our data
def readTournamentPlayers(row):
    r = [int(r) if r.isdigit() else r for r in row]
    playerList = [id for id in r[3:] if id != '']
    tournamentId = r[2]
    return {'tournamentId': tournamentId, 'playerList': playerList}


with open('tournaments.csv') as f:
    reader = csv.reader(f)
    next(reader)  # toss the header row
    tournaments = [readTournamentPlayers(row) for row in reader]

# print parsePlayer(142, 15)
# print parsePlayer(309, 686)

[parseTournament(**tournament) for tournament in tournaments]
    def handle(self, *args, **options):
        """
        Make it happen.
        """
        super(Command, self).handle(*args, **options)

        # set / compute any attributes that multiple class methods need
        self.keep_file = options["keep_file"]

        # get model based on strings of app_name and model_name
        self.model = apps.get_model(options["app_name"], options['model_name'])

        # load from provided csv or csv mapped to model
        self.csv = options["csv"] or self.model.objects.get_csv_path()

        # load into database suggested for model by router
        self.database = router.db_for_write(model=self.model)

        # get most recently cleaned RawDataFile
        try:
            raw_file = RawDataFile.objects.filter(
                file_name=self.model._meta.db_table,
                clean_start_datetime__isnull=False
            ).latest('clean_start_datetime')
        except RawDataFile.DoesNotExist:
            raise CommandError(
                'No record of cleaning {0}.TSV (run `python manage.py '
                'cleancalaccessrawfile {0}`).'.format(self.model._meta.db_table)
            )
        # raise exception if clean step did not finish
        if not raw_file.clean_finish_datetime:
            raise CommandError(
                'Previous cleaning of {0}.TSV did not finish (run `python manage.py '
                'cleancalaccessrawfile {0}`).'.format(self.model._meta.db_table)
            )

        # Get the row count from the source CSV
        with open(self.csv, 'r') as infile:
            self.csv_row_count = max(sum(1 for line in infile) - 1, 0)

        # Quit if the CSV is empty.
        if not self.csv_row_count:
            if self.verbosity > 2:
                self.failure("{} is empty.".format(self.csv))
            return

        # Get the headers from the source CSV
        with open(self.csv, 'r') as infile:
            csv_reader = reader(infile)
            self.csv_headers = next(csv_reader)

        # store the start time for the load
        raw_file.load_start_datetime = now()
        # reset the finish time for the load
        raw_file.load_finish_datetime = None
        # save here in case command doesn't finish
        raw_file.save()

        # Load table
        if self.verbosity > 2:
            self.log(" Loading {}".format(options['model_name']))
        self.load()

        # add load counts to raw_file_record
        raw_file.load_columns_count = len(self.model._meta.fields)
        raw_file.load_records_count = self.model.objects.count()

        # Log an error if the counts don't match
        if self.verbosity > 2 and raw_file.load_records_count != self.csv_row_count:
            msg = "  Table record count doesn't match CSV. {} in the table  vs. {} in the CSV."
            self.failure(msg.format(raw_file.load_records_count, self.csv_row_count))

        # if not keeping files, remove the csv file
        if not self.keep_file:
            os.remove(self.csv)

        # store the finish time for the load
        raw_file.load_finish_datetime = now()

        # and save the RawDataFile
        raw_file.save()
Example n. 26
    })
    # let's be polite
    time.sleep(1)
    print(r.url)
    s = Soup(r.text, 'html.parser')
    l = [parseTable(r, playerId, tournamentId) for r in s.find_all(class_='scorecard-table')]
    return [item for sublist in l for item in sublist]

def parseTournament(tournamentId, playerList):
    final_data = [parsePlayer(tournamentId, playerId) for playerId in playerList]
    return [item for sublist in final_data for item in sublist]

#finalData = parseTournament(309, [66,686])

# load in our data
def readTournamentPlayers(row):
    r = [int(r) if r.isdigit() else r for r in row]
    playerList = [id for id in r[3:] if id != '']
    tournamentId = r[2]
    return { 'tournamentId' : tournamentId, 'playerList' : playerList }

with open('tournaments.csv') as f:
    reader = csv.reader(f)
    next(reader)  # toss the header row
    tournaments = [readTournamentPlayers(row) for row in reader]

# print parsePlayer(142, 15)
# print parsePlayer(309, 686)

[parseTournament(**tournament) for tournament in tournaments]
Example n. 27
    def test_reader_alias(self):
        with open('examples/test_utf8.csv') as f:
            reader = csvkit.reader(f, encoding='utf-8')
            self.assertEqual(next(reader), ['a', 'b', 'c'])
            self.assertEqual(next(reader), ['1', '2', '3'])
            self.assertEqual(next(reader), ['4', '5', u'ʤ'])
Example n. 28
    def from_csv(cls, f, name='from_csv_table', snifflimit=None,
                 column_ids=None, blanks_as_nulls=True, zero_based=False,
                 infer_types=True, no_header_row=False, **kwargs):
        """ Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a
        matching identifier to be parsed, type inferred, etc. However, their
        order/index property will reflect the original data (e.g. column 8
        will still be "order" 7, even if it's the third column in the resulting
        Table.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # snifflimit == 0 means do not sniff
        if snifflimit is None:
            kwargs['dialect'] = sniffer.sniff_dialect(contents)
        elif snifflimit > 0:
            kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

        f = six.StringIO(contents)
        rows = reader(f, **kwargs)

        if no_header_row:
            # Peek at a row to infer column names from
            row = next(rows)

            headers = make_default_headers(len(row))
            column_ids = parse_column_identifiers(column_ids, headers,
                                                  zero_based)
            headers = [headers[c] for c in column_ids]
            data_columns = [[] for c in headers]

            # Put row back on top
            rows = itertools.chain([row], rows)
        else:
            headers = next(rows)

            if column_ids:
                column_ids = parse_column_identifiers(column_ids, headers,
                                                      zero_based)
                headers = [headers[c] for c in column_ids]
            else:
                column_ids = range(len(headers))

            data_columns = [[] for c in headers]

        width = len(data_columns)

        for i, row in enumerate(rows):
            j = 0

            for j, d in enumerate(row):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

            j += 1

            # Populate remaining columns with None
            while j < width:
                data_columns[j].append(None)

                j += 1

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c,
                                  blanks_as_nulls=blanks_as_nulls,
                                  infer_types=infer_types))

        return Table(columns, name=name)
Example n. 29
    def test_reader_alias(self):
        with open('examples/test_utf8.csv', encoding='utf-8') as f:
            reader = csvkit.reader(f)
            self.assertEqual(next(reader), ['a', 'b', 'c'])
            self.assertEqual(next(reader), ['1', '2', '3'])
            self.assertEqual(next(reader), ['4', '5', u'ʤ'])
import csvkit, sys
from collections import defaultdict

writer = csvkit.writer(sys.stdout)
with open(sys.argv[1]) as csv_file:
    for i, row in enumerate(csvkit.reader(csv_file)):
        if i == 0:
            col_count = len(row) - 1
            freqs = [defaultdict(int) for col in range(col_count)]
            continue
        for col in range(col_count):
            freqs[col][int(row[col + 1])] += 1
    values = sum((list(freqs[col]) for col in range(col_count)), [])
    for val in sorted(set(values)):
        val_freqs = [freqs[col][val] for col in range(col_count)]
        row = [val] + val_freqs
        writer.writerow(row)
Example n. 31
import requests
import csvkit
import os
import json
import io

#fetch files
with open('./out6_file3_address_3_clean.csv', 'rb') as f:
    reader = csvkit.reader(f)
    your_list = list(reader)
    print your_list[0][0]

#geocode
results = []
for i, val in enumerate(your_list):
    address = val[0]
    url = 'http://localhost:3100/v1/search'
    r = requests.get(url, params={'text': address})
    rjson = r.json()['features'][0]
    rjson['properties']['query'] = address
    results.append(rjson)

with open('./out6_file3_address_3_clean.json', 'wb') as fd:
    fd.write(json.dumps(results))
#from postal.parser import parse_address
#parse_address('The Book Club 100-106 Leonard St Shoreditch London EC2A 4RH, United Kingdom')
Example n. 32
    def handle(self, *args, **options):
        verbosity = options['verbosity']
        if verbosity == '0':
            self.logger.setLevel(logging.ERROR)
        elif verbosity == '1':
            self.logger.setLevel(logging.WARNING)
        elif verbosity == '2':
            self.logger.setLevel(logging.INFO)
        elif verbosity == '3':
            self.logger.setLevel(logging.DEBUG)

        csvfile = options['csv_file']
        encoding = options['encoding']

        csv_out = out = options['out']

        if type(out) == str:
            csv_out = open(out, 'wb')

        writer = csv.writer(csv_out, delimiter=';', quotechar='"', encoding=encoding)
        writer.writerow(['slug', 'url', 'attivo', 'tema', 'natura', 'cup',
                         'programma', 'classificazione_qsn', 'fondo_comunitario',
                         'fin_totale_pubblico', 'fin_totale_pubblico_netto', 'pagamento',
                         'stato_progetto','stato_finanziamenti'])

        locale.setlocale(locale.LC_ALL, 'it_IT.UTF-8')

        with open(csvfile, 'rb') as cfile:
            reader = csv.reader(cfile, delimiter=',', quotechar='"')
            for r in reader:
                slug = None
                url = '-'
                output_r = r
                if not r:
                    continue

                url = r[0].strip()
                slug_search = re.search(
                    '^(http://){0,1}(www\.){0,1}opencoesione.gov.it/progetti/('
                    '.*?)/?$',
                    url, re.IGNORECASE
                )
                if slug_search:
                    slug = slug_search.group(3)

                if slug and '/' not in slug:
                    output_r = [slug, r[0]]

                try:
                    p = Progetto.fullobjects.get(slug=slug)
                    is_active = p.active_flag
                    tema = p.tema.tema_superiore.short_label
                    natura = p.classificazione_azione.classificazione_superiore\
                        .short_label
                    cup = p.cup
                    programma = ','.join([f.descrizione for f in p.fonti_fin])
                    class_qsn = p.classificazione_qsn.classificazione_superiore.classificazione_superiore.descrizione
                    fondo_com = p.get_fondo_comunitario_display()

                    fin_tot = locale.currency(p.fin_totale_pubblico).replace('Eu', u'€')
                    fin_tot_netto = locale.currency(p.fin_totale_pubblico_netto).replace('Eu', u'€')
                    pagamento = locale.currency(p.pagamento).replace('Eu', u'€')
                    stato_fin = p.get_stato_finanziario_display()
                    stato_prog = p.get_stato_progetto_display()

                    output_r.extend([is_active, tema, natura, cup, programma, class_qsn, fondo_com,
                                     fin_tot, fin_tot_netto, pagamento,
                                     stato_fin, stato_prog])
                except ObjectDoesNotExist:
                    pass

                self.logger.info(r[0])
                writer.writerow(output_r)
Example n. 33
#!/usr/bin/env python

# Remove newline chars from CSV "cells"
# Input is taken from stdin and output spit to stdout

import csvkit
import sys

reader = csvkit.reader(sys.stdin)
writer = csvkit.writer(sys.stdout)
for row in reader:
    for i in range(0, len(row)):
        if isinstance(row[i], str):
            if "\n" in row[i]:
                row[i] = row[i].replace("\n", '')
    writer.writerow(row)
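The same cell-level transformation shown self-contained with the stdlib csv module, for anyone without csvkit installed:

import csv
import io

dirty = io.StringIO('a,"line one\nline two"\n')  # newline inside a quoted cell
clean = io.StringIO()
writer = csv.writer(clean)
for row in csv.reader(dirty):
    writer.writerow([cell.replace('\n', '') for cell in row])
print(clean.getvalue())  # a,line oneline two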
Example n. 34
#! /usr/bin/env python
# from http://unix.stackexchange.com/questions/60590/is-there-a-command-line-utility-to-transpose-a-csv-file
import csvkit as csv, sys
rows = list(csv.reader(sys.stdin))
writer = csv.writer(sys.stdout)

for col in range(len(rows[0])):
    writer.writerow([row[col] for row in rows])
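The same transpose on in-memory data; zip(*rows) is the idiomatic equivalent of the per-column loop above:

import csv
import io

rows = list(csv.reader(io.StringIO('a,b\n1,2\n3,4\n')))
out = io.StringIO()
csv.writer(out).writerows(zip(*rows))
print(out.getvalue())  # a,1,3 then b,2,4 on the next line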
Example n. 35
File: dbadd.py Project: k-nish/GCI
#coding: utf8
import mysql.connector
import config
import csvkit

dbcon = mysql.connector.connect(database=config.db, user=config.user, password=config.passwd, host=config.host)
dbcur = dbcon.cursor()

sql1 = "drop table if exists rating;"
dbcur.execute(sql1);
print "table削除"

sql2 = "create table rating (userID text, placeID int, rating int, food_rating int, service_rating int);"
dbcur.execute(sql2);
print "table作成"

csv_data = csvkit.reader(open('/Users/K/dropbox/RCdata/rating_final.csv'))
for row in csv_data:
	sql = "INSERT INTO `rating`(`userID`, `placeID`, `rating`, `food_rating`, `service_rating`) VALUES (%s,%s,%s,%s,%s)"
	dbcur.execute(sql, row)

# actually commit the changes to MySQL
dbcon.commit()

dbcur.close()
dbcon.close()

Example n. 36
import codecs, csvkit

DATA = '../data/'
with open(DATA+'explicacoes.csv', 'rb') as csvfile:
	arquivo = csvkit.reader(csvfile, delimiter=',', encoding='utf-8')
	explicacoes = []
	for linha in arquivo:
		explicacao = {
			'_id' : linha[0],
			'sigla' : linha[0],
			'nome' : linha[1],
			'descricao' : linha[2].strip()
		}
		explicacoes.append(explicacao)

def mongo_save(explicacoes, clear=False):
    from pymongo import MongoClient
    client = MongoClient()
    db = client.monitorlegislativo
    collection = db.explicacoes
    if (clear):
        collection.drop()
    for e in explicacoes:
        collection.update({'_id' : e['_id']}, e, upsert=True)

mongo_save(explicacoes)
    def handle(self, *args, **options):
        """
        Make it happen.
        """
        super(Command, self).handle(*args, **options)

        # set / compute any attributes that multiple class methods need
        self.keep_file = options["keep_file"]

        # get model based on strings of app_name and model_name
        self.model = apps.get_model(options["app_name"], options['model_name'])

        # load from provided csv or csv mapped to model
        self.csv = options["csv"] or self.model.objects.get_csv_path()

        # load into database suggested for model by router
        self.database = router.db_for_write(model=self.model)

        # get most recently cleaned RawDataFile
        try:
            raw_file = RawDataFile.objects.filter(
                file_name=self.model._meta.db_table,
                clean_start_datetime__isnull=False).latest(
                    'clean_start_datetime')
        except RawDataFile.DoesNotExist:
            raise CommandError(
                'No record of cleaning {0}.TSV (run `python manage.py '
                'cleancalaccessrawfile {0}`).'.format(
                    self.model._meta.db_table))
        # raise exception if clean step did not finish
        if not raw_file.clean_finish_datetime:
            raise CommandError(
                'Previous cleaning of {0}.TSV did not finish (run `python manage.py '
                'cleancalaccessrawfile {0}`).'.format(
                    self.model._meta.db_table))

        # Get the row count from the source CSV
        with open(self.csv, 'r') as infile:
            self.csv_row_count = max(sum(1 for line in infile) - 1, 0)

        # Quit if the CSV is empty.
        if not self.csv_row_count:
            if self.verbosity > 2:
                self.failure("{} is empty.".format(self.csv))
            return

        # Get the headers from the source CSV
        with open(self.csv, 'r') as infile:
            csv_reader = reader(infile)
            self.csv_headers = next(csv_reader)

        # store the start time for the load
        raw_file.load_start_datetime = now()
        # reset the finish time for the load
        raw_file.load_finish_datetime = None
        # save here in case command doesn't finish
        raw_file.save()

        # Load table
        if self.verbosity > 2:
            self.log(" Loading {}".format(options['model_name']))
        self.load()

        # add load counts to raw_file_record
        raw_file.load_columns_count = len(self.model._meta.fields)
        raw_file.load_records_count = self.model.objects.count()

        # Log an error if the counts don't match
        if self.verbosity > 2 and raw_file.load_records_count != self.csv_row_count:
            msg = "  Table record count doesn't match CSV. {} in the table  vs. {} in the CSV."
            self.failure(
                msg.format(raw_file.load_records_count, self.csv_row_count))

        # if not keeping files, remove the csv file
        if not self.keep_file:
            os.remove(self.csv)

        # store the finish time for the load
        raw_file.load_finish_datetime = now()

        # and save the RawDataFile
        raw_file.save()
Example n. 38
#!/usr/bin/env python

# Remove newline chars from CSV "cells"
# Input is taken from stdin and output spit to stdout

import csvkit
import sys

reader = csvkit.reader(sys.stdin)
writer = csvkit.writer(sys.stdout)
for row in reader:
  for i in range(0, len(row)):
    if isinstance(row[i], (str, unicode)):
      if "\n" in row[i]:
        row[i] = row[i].replace("\n", '')
  writer.writerow(row)
Example n. 39
def openFile():
	with open('./out6_file3_address_3_clean.csv', 'rb') as f:
		reader = csvkit.reader(f)
		your_list = list(reader)
		print your_list[0][0]