Example no. 1
    def export_as_csv(modeladmin, request, queryset,
                      fields=None, exclude=None, extra=None, header=True):
        """ Generic csv export admin action.
        Based on http://djangosnippets.org/snippets/2712/
        """
        model = modeladmin.model
        opts = modeladmin.model._meta
        field_names = [field.name for field in opts.fields]

        if exclude:
            field_names = [f for f in field_names if f not in exclude]
        elif fields:
            # copy the list, not just assign the list
            field_names = list(fields)

        labels = prep_label(model, field_names)

        if extra:
            field_names += extra
            labels += prep_extra_label(model, extra)

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=%s.csv' % (
            unicode(opts).replace('.', '_')
        )

        writer = UnicodeWriter(response)

        if header:
            writer.writerow(labels)

        for obj in queryset:
            writer.writerow([prep_field(obj, field) for field in field_names])

        return response
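
Every example on this page leans on a UnicodeWriter helper that is never shown. For reference, the classic recipe from the Python 2 csv module documentation looks roughly like the sketch below; individual projects may carry their own variant (Example no. 11, for instance, passes an encoding argument), so treat it as an illustration rather than the exact class used here. It assumes every cell passed to writerow() is already a unicode string.

import csv, codecs, cStringIO

class UnicodeWriter:
    """A CSV writer that writes unicode rows to the file-like object "f",
    encoded in the given encoding (recipe from the Python 2 csv docs)."""

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.queue = cStringIO.StringIO()   # rows are first serialized here as UTF-8
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue().decode("utf-8")
        self.stream.write(self.encoder.encode(data))   # re-encode to the target encoding
        self.queue.truncate(0)                         # reset the buffer for the next row

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

With a helper like this in scope, the admin action above can be attached by listing it in a ModelAdmin, e.g. actions = [export_as_csv].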
Example no. 2
 def parse_statistics(self):
     time_spent = time.time() - self.start_time
     result_string = 'Done!\nSuccessfully parsed {} items.\nFailed to parse {} items.\nSkipped {} items.\nTime spent:{}'.format(
         self.parsed_successfully, self.parsed_unsuccessfully, self.skipped,
         time_spent)
     print result_string
     if self.savestats:
         output_file_name = 'parse_statistics_' + timezone.now().strftime(
             "%Y%m%d-%H%M") + '.txt'  # %M gives minutes; "%m" would repeat the month
         failed_csv_output_file = 'failed_rows_' + timezone.now().strftime(
             "%Y%m%d-%H%M") + '.csv'
         result_string += '\n Failed rows saved to {}'.format(
             failed_csv_output_file)
         print 'Saving extended statistics to file: {}'.format(
             output_file_name)
         f = open(output_file_name, 'w')
         f.write(result_string)
         f.close()
         with open(failed_csv_output_file, 'wb') as csv_output:
             writer = UnicodeWriter(csv_output,
                                    quotechar='"',
                                    delimiter=';')
             csv_output.write('Next rows failed to be parsed\n')  # header line in the CSV (the original wrote to the filename string)
             for r in self.failed_rows:
                 try:
                     for i in xrange(len(
                             r)):  # cleaner way "for x in r" fails to work
                         r[i] = r[i].decode('UTF-8')
                     writer.writerow(r)
                 except BaseException as e:
                     print e
                     print r
Example no. 3
    def __init__(self,
                 corpus,
                 parser=sentence_parse,
                 compute_similarities=True):
        """Return a new ingester for the corpus.
        
        parser may be sentence_parse or ngram_parser(n)
        
        Client must ensure that no other ingester is running
        concurrently on the same corpus.
        """

        self.corpus = corpus
        self.parser = parser
        self.should_compute_similarities = compute_similarities

        max_doc_id = corpus.max_doc_id()
        self.next_id = max_doc_id + 1 if max_doc_id is not None else 0

        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file = tempfile.TemporaryFile()

        self.sequencer = PhraseSequencer(corpus)
Example no. 4
def artists(request):
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="artists.csv"'

    writer = UnicodeWriter(response)
    fieldNames = TArtist.field_names()
    writer.writerow(fieldNames)
    
    for artist in TArtist.objects.all()[:1000]:
        values = []
        for fieldName in fieldNames:
            values.append(artist.field_value(fieldName))
        writer.writerow(values)

    return response
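
A minimal sketch of how a view like this is typically exposed, assuming Django 1.x-style URL configuration; the URL pattern, module layout and name below are illustrative, not taken from the project above.

# urls.py (illustrative)
from django.conf.urls import url

from . import views

urlpatterns = [
    url(r'^artists\.csv$', views.artists, name='artists_csv'),
]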
Example no. 5
def csvwrite(filename, verbose=None, codepage="cp1252"):
    """ Write the file on the desired format """
    records = 0
    table = dbf.Table(filename, codepage=codepage)
    table.open()
    with open("{f}.csv".format(f=filename), "w") as csvfile:
        # Include the first line as a comma-separated line, no-quoting
        csvfile.write(",".join(table.field_names)+"\n")
        csvwriter = UnicodeWriter(csvfile, quoting=QUOTE_ALL)
        for records, row in enumerate(table, start=1):
            csvwriter.writerow([field.strip() for field in row])
            if verbose:
                # print a dot every 1000 records
                if records % 1000 == 0:
                    print('.', end="")
    return records
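
A hypothetical call, with a made-up file name. The snippet assumes the surrounding module imports QUOTE_ALL from csv, the third-party dbf package, and print_function from __future__ (the print('.', end="") call requires it under Python 2).

if __name__ == '__main__':
    # converts customers.dbf into customers.dbf.csv and reports the row count
    total = csvwrite("customers.dbf", verbose=True)
    print("Exported {} records".format(total))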
Example no. 6
def process_folder(folder):
    """
    Writes annotations from all FoLiA .xml-files from the given folder to a .csv-file.
    """
    # Create an output .csv-file
    with open(os.path.join(folder, 'annotations.csv'), 'wb') as csv_file:
        csv_file.write(u'\uFEFF'.encode('utf-8'))  # write a UTF-8 BOM so Excel picks up the encoding
        csv_writer = UnicodeWriter(csv_file, delimiter=';')
        csv_writer.writerow(['tekstnummer', 'zin nr',
                             'geannoteerde passage', 'correctie',
                             'eenheid', 'probleem', 'woordsoort',
                             'originele zin', 'gecorrigeerde zin'])

        # Loop over all .xml-files in the given folder
        for filename in glob.glob(os.path.join(folder, '*.xml')):
            print 'Processing ', filename
            process_file(csv_writer, filename)
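
A hypothetical invocation; the folder path is illustrative. The function scans the folder for *.xml files and writes annotations.csv into that same folder.

if __name__ == '__main__':
    process_folder('/data/folia')   # produces /data/folia/annotations.csv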
Example no. 7
 def parse_statistics(self):
     time_spent = time.time() - self.start_time
     result_string = 'Done!\nSuccessfully parsed {} items.\nFailed to parse {} items.\nSkipped {} items.\nTime spent:{}'.format(
         self.parsed_successfully, self.parsed_unsuccessfully, self.skipped, time_spent)
     print result_string
     if self.savestats:
         output_file_name = 'parse_statistics_' + timezone.now().strftime("%Y%m%d-%H%M") + '.txt'  # %M gives minutes; "%m" would repeat the month
         failed_csv_output_file = 'failed_rows_' + timezone.now().strftime("%Y%m%d-%H%M") + '.csv'
         result_string += '\n Failed rows saved to {}'.format(failed_csv_output_file)
         print 'Saving extended statistics to file: {}'.format(output_file_name)
         f = open(output_file_name, 'w')
         f.write(result_string)
         f.close()
         with open(failed_csv_output_file, 'wb') as csv_output:
             writer = UnicodeWriter(csv_output, quotechar='"', delimiter=';')
             csv_output.write('Next rows failed to be parsed\n')  # header line in the CSV (the original wrote to the filename string)
             for r in self.failed_rows:
                 try:
                     for i in xrange(len(r)):  # cleaner way "for x in r" fails to work
                         r[i] = r[i].decode('UTF-8')
                     writer.writerow(r)
                 except BaseException as e:
                     print e
                     print r
Example no. 8
    def write_tweets(self, file_name, tweets, columns):

        if file_name is None or tweets is None:
            return None

        # open in binary mode (as the Python 2 csv module expects) and make
        # sure the file is closed once writing is done
        with open(file_name, 'wb') as output_file:
            unicode_writer = UnicodeWriter(output_file)
            unicode_writer.writerow(columns)
            for tweet in tweets:
                row = [tweet.get(column, '') for column in columns]
                unicode_writer.writerow(row)
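
A usage sketch with made-up data; exporter stands in for an instance of the (unshown) class that defines write_tweets.

tweets = [
    {u'id_str': u'123', u'text': u'h\u00e9llo world'},
    {u'id_str': u'456'},                # no 'text' key -> written as an empty cell
]
exporter.write_tweets('tweets.csv', tweets, [u'id_str', u'text'])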
Example no. 9
def main():
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    with args.output:
        csv_writer = UnicodeWriter(args.output)
        csv_writer.writerow(bacparser.models.get_model(args.year)._fields)
        for filename in args.filenames:
            logging.info('Converting %s' % (filename,))
            with open_compressed_file(filename) as f:
                unpickler = pickle.Unpickler(f)
                try:
                    while True:
                        o = unpickler.load()
                        csv_writer.writerow(o)
                except EOFError:
                    pass
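
open_compressed_file() is not shown; one plausible implementation picks a reader by file extension, roughly as below. The real helper in bacparser may well differ.

import gzip
import bz2

def open_compressed_file(filename):
    """Open a possibly compressed pickle file for binary reading."""
    if filename.endswith('.gz'):
        return gzip.open(filename, 'rb')
    if filename.endswith('.bz2'):
        return bz2.BZ2File(filename, 'rb')
    return open(filename, 'rb')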
Example no. 10
    def __init__(self, corpus, parser=sentence_parse, compute_similarities=True):
        """Return a new ingester for the corpus.
        
        parser may be sentence_parse or ngram_parser(n)
        
        Client must ensure that no other ingester is running
        concurrently on the same corpus.
        """
        
        self.corpus = corpus
        self.parser = parser
        self.should_compute_similarities = compute_similarities
        
        max_doc_id = corpus.max_doc_id()
        self.next_id = max_doc_id + 1 if max_doc_id is not None else 0
        
        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file = tempfile.TemporaryFile()
        
        self.sequencer = PhraseSequencer(corpus)
Example no. 11
def get_company_attr(market, urls, output_file):
    col_headers = [  # columns that already exist on Wikipedia
        'Industry',
        'Products',
        'Type',
        'Traded as',
        'Area served',
        'Parent',
        'Subsidiaries',
        'Key people',
        'Genre',
        'Services',
        'Owner(s)',
        'Employees',
        'Website',
        'Logo',
    ]
    extra_col_headers = [  # columns derived during processing
        'Symbol',
    ]
    csv_writer = UnicodeWriter(open(output_file, 'wb'),
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL,
                               encoding='utf-8')
    csv_writer.writerow(['Wiki link'] + col_headers + extra_col_headers)

    re_obj = re.compile(r"%s : (?P<symbol>\w*)" % market)

    for url in company_wiki_link_generator(urls):  # each company's wiki page
        print url
        response = get_web_contents(url)
        d = pq(response)
        p = d(".infobox")
        infobox = p.html()  # extract the table that holds the company data

        d = pq(infobox)
        rows = d.find('tr')

        data = [None] * (len(col_headers) + len(extra_col_headers))
        symbol = ''
        for row in rows:  # each row of the company-data table
            th_txt = pq(row).find('th').text()
            td = pq(row).find('td')
            if td.attr('class') == 'logo':
                td_txt = td.find('img').attr('src')
                data[col_headers.index('Logo')] = td_txt
            if th_txt in col_headers:
                td_txt = td.text()
                data[col_headers.index(th_txt)] = td_txt

            # pull the stock symbol from the Wiki 'Type' or 'Traded as' column

            if th_txt == 'Type':
                try:
                    symbol = re_obj.search(td_txt).group("symbol")
                except Exception:
                    pass
            if (not symbol) and th_txt == 'Traded as':
                try:
                    symbol = re_obj.search(td_txt).group("symbol")
                except Exception:
                    pass
            data[len(col_headers) + extra_col_headers.index("Symbol")] = symbol

        csv_writer.writerow([url] + data)
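
A worked example of the symbol regex, using made-up infobox text; note that the pattern only matches when the market and ticker are separated by a spaced colon, exactly as the pattern above is written.

import re

market = 'NASDAQ'
re_obj = re.compile(r"%s : (?P<symbol>\w*)" % market)
match = re_obj.search(u'Public company (NASDAQ : GOOG)')
print match.group('symbol')   # -> GOOG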
Example no. 12
class DocumentIngester(object):
    
    def __init__(self, corpus, parser=sentence_parse, compute_similarities=True):
        """Return a new ingester for the corpus.
        
        parser may be sentence_parse or ngram_parser(n)
        
        Client must ensure that no other ingester is running
        concurrently on the same corpus.
        """
        
        self.corpus = corpus
        self.parser = parser
        self.should_compute_similarities = compute_similarities
        
        max_doc_id = corpus.max_doc_id()
        self.next_id = max_doc_id + 1 if max_doc_id is not None else 0
        
        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file = tempfile.TemporaryFile()
        
        self.sequencer = PhraseSequencer(corpus)
        
    
    def _record_document(self, text, phrases, metadata):
        doc_id = self.next_id
        self.next_id += 1
        
        formatted_metadata = ",".join([('"%s"=>"%s"' % (key, value.replace('\\', '\\\\').replace('"', '\\"'))) for (key, value) in metadata.items()])
        self.document_writer.writerow([str(self.corpus.id), str(doc_id), text, formatted_metadata])
        
        for (phrase_id, indexes) in phrases:
            formatted_indexes = '"{%s}"' % ", ".join(['""(%s, %s)""' % (start, end) for (start, end) in indexes])
            self.occurrence_file.write("%s,%s,%s,%s\n" % (self.corpus.id, doc_id, phrase_id, formatted_indexes))

        return doc_id 
        
        
    def _upload_new_documents(self):
        """Upload document text and phrase occurrences
        
        Return list of new document_ids
        
        """
        
        self.document_file.flush()
        self.document_file.seek(0)
        self.corpus.upload_csv(self.document_file, 'documents')
        self.document_file.close()
        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)  # recreate the writer set up in __init__ for the next batch

        self.occurrence_file.flush()
        self.occurrence_file.seek(0)
        self.corpus.upload_csv(self.occurrence_file, 'phrase_occurrences')
        self.occurrence_file.close()
        self.occurrence_file = tempfile.TemporaryFile()


    def ingest(self, docs):
        """Ingest set of new documents"""
        
        new_doc_ids = list()
        
        print "parsing %s documents..." % len(docs)
    
        for doc in docs:
            if isinstance(doc, basestring):
                text = doc
                metadata = {}
            else:
                text = doc['text']
                metadata = doc['metadata']

            phrases = self.parser(text, self.sequencer)
            id = self._record_document(text, phrases, metadata)
            new_doc_ids.append(id)
            
        print "uploading documents..."
        
        self.sequencer.upload_new_phrases()
        self._upload_new_documents()
        
        if self.should_compute_similarities:
            print "computing similarities..."
            self.compute_similarities(new_doc_ids)

    @staticmethod
    def _pairs_for_comparison(all_ids, new_ids):
        allowed_ids = set(all_ids)
        all_ids = list(all_ids)
        all_ids.sort()
    
        new_ids = list(new_ids)
        new_ids.sort(reverse=True)
    
        for x in all_ids:
            for y in new_ids:
                if x >= y:
                    break
                if y in allowed_ids:
                    yield (x, y)

    def compute_similarities(self, new_doc_ids=None, min_similarity=0.5):
        docs = self.corpus.all_docs()

        # new_doc_ids is used to keep from recomputing already known similarities.
        # None is special signal to compute on all doc pairs.
        if new_doc_ids is None:
            new_doc_ids = docs.keys()
    
        with get_similarity_writer(self.corpus.id) as writer:
            i = 0
            for (x, y) in self._pairs_for_comparison(docs.keys(), new_doc_ids):
                similarity = jaccard(docs[x], docs[y])
                if similarity >= min_similarity:
                    writer.write(x, y, similarity)
                
                i += 1
                if i % 10000000 == 0:
                    writer.flush()
                    sys.stdout.write('.')
                    sys.stdout.flush()
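
jaccard() and get_similarity_writer() come from elsewhere in this project. A minimal sketch of the former, assuming docs[x] is a collection of phrase ids, could look like this:

def jaccard(a, b):
    """Jaccard similarity (intersection over union) of two phrase-id collections."""
    a, b = set(a), set(b)
    if not a and not b:
        return 1.0
    return len(a & b) / float(len(a | b))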