# Example #1
 def parse_statistics(self):
     time_spent = time.time() - self.start_time
     result_string = 'Done!\nSuccessfully parsed {} items.\nFailed to parse {} items.\nSkipped {} items.\nTime spent:{}'.format(
         self.parsed_successfully, self.parsed_unsuccessfully, self.skipped,
         time_spent)
     print result_string
     if self.savestats:
         output_file_name = 'parse_statistics_' + timezone.now().strftime(
             "%Y%m%d-%H%m") + '.txt'
         failed_csv_output_file = 'failed_rows_' + timezone.now().strftime(
             "%Y%m%d-%H%m") + '.csv'
         result_string += '\n Failed rows saved to {}'.format(
             failed_csv_output_file)
         print 'Saving extended statistics to file: {}'.format(
             output_file_name)
         f = open(output_file_name, 'w')
         f.write(result_string)
         f.close()
         with open(failed_csv_output_file, 'wb') as csv_output:
             writer = UnicodeWriter(csv_output,
                                    quotechar='"',
                                    delimiter=';')
             output_file_name.write('Next rows failed to be parsed\n')
             for r in self.failed_rows:
                 try:
                     for i in xrange(len(
                             r)):  # cleaner way "for x in r" fails to work
                         r[i] = r[i].decode('UTF-8')
                     writer.writerow(r)
                 except BaseException as e:
                     print e
                     print r
    def __init__(self,
                 corpus,
                 parser=sentence_parse,
                 compute_similarities=True):
        """Create a new ingester bound to *corpus*.

        parser may be sentence_parse or ngram_parser(n).

        The caller must ensure that no other ingester runs
        concurrently against the same corpus.
        """
        self.corpus = corpus
        self.parser = parser
        self.should_compute_similarities = compute_similarities

        # Continue numbering after the highest existing document id,
        # or start from zero on an empty corpus.
        highest_id = corpus.max_doc_id()
        self.next_id = 0 if highest_id is None else highest_id + 1

        # Documents and phrase occurrences are spooled to temp files
        # before being committed to the corpus.
        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file = tempfile.TemporaryFile()

        self.sequencer = PhraseSequencer(corpus)
# Example #3
    def write_tweets(self, file_name, tweets, columns):
        """Write *tweets* (an iterable of dicts) to *file_name* as CSV.

        The first row is *columns*; each subsequent row holds the tweet's
        value for each column, or '' when the tweet lacks that key.
        Returns None; does nothing when file_name or tweets is None.
        """
        if file_name is None or tweets is None:
            return None

        # BUG FIX: the original leaked the file handle (open() with no
        # close()), so buffered rows could be lost. Use a with-block.
        with open(file_name, 'w') as out_file:
            unicode_writer = UnicodeWriter(out_file)
            unicode_writer.writerow(columns)
            for tweet in tweets:
                # Missing columns are emitted as empty strings.
                unicode_writer.writerow(
                    [tweet[column] if column in tweet else ''
                     for column in columns])
# Example #4
def main():
    """Convert pickled model records from args.filenames to CSV on args.output."""
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    with args.output:
        csv_writer = UnicodeWriter(args.output)
        # Header row comes from the model's named-tuple fields for that year.
        csv_writer.writerow(bacparser.models.get_model(args.year)._fields)
        # FIX: loop body was over-indented by an extra level (harmless but
        # misleading); normalized to one level.
        for filename in args.filenames:
            # Lazy %-args: formatting is skipped when INFO is disabled.
            logging.info('Converting %s', filename)
            with open_compressed_file(filename) as f:
                # NOTE(review): pickle is unsafe on untrusted data; these
                # files are presumed to be produced by this project itself.
                unpickler = pickle.Unpickler(f)
                try:
                    # Each file is a stream of pickled records until EOF.
                    while True:
                        o = unpickler.load()
                        csv_writer.writerow(o)
                except EOFError:
                    pass
# Example #5
def process_folder(folder):
    """
    Writes annotations from all FoLiA .xml-files from the given folder to a .csv-file.
    """
    # Create an output .csv-file
    with open(os.path.join(folder, 'annotations.csv'), 'wb') as csv_file:
        csv_file.write(u'\uFEFF'.encode('utf-8'))  # the UTF-8 BOM to hint Excel we are using that...
        csv_writer = UnicodeWriter(csv_file, delimiter=';')
        csv_writer.writerow(['tekstnummer', 'zin nr',
                             'geannoteerde passage', 'correctie',
                             'eenheid', 'probleem', 'woordsoort',
                             'originele zin', 'gecorrigeerde zin'])

        # Loop over all .xml-files in the given folder
        for filename in glob.glob(os.path.join(folder, '*.xml')):
            print 'Processing ', filename
            process_file(csv_writer, filename)
# Example #6
def get_company_attr(market, urls, output_file):
    col_headers = [  #維基百科上原本就有的欄位
        'Industry',
        'Products',
        'Type',
        'Traded as',
        'Area served',
        'Parent',
        'Subsidiaries',
        'Key people',
        'Genre',
        'Services',
        'Owner(s)',
        'Employees',
        'Website',
        'Logo',
    ]
    extra_col_headers = [  #經過處理後產生的欄位
        'Symbol',
    ]
    csv_writer = UnicodeWriter(open(output_file, 'wb'),
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL,
                               encoding='utf-8')
    csv_writer.writerow(['Wiki link'] + col_headers + extra_col_headers)

    re_obj = re.compile("%s : (?P<symbol>\w*)" % market)

    for url in company_wiki_link_generator(urls):  #每個公司的wiki頁面
        print url
        response = get_web_contents(url)
        d = pq(response)
        p = d(".infobox")
        infobox = p.html()  #取出公司資料的table

        d = pq(infobox)
        rows = d.find('tr')

        data = [None] * (len(col_headers) + len(extra_col_headers))
        symbol = ''
        for row in rows:  #公司資料table中的每一列
            th_txt = pq(row).find('th').text()
            td = pq(row).find('td')
            if td.attr('class') == 'logo':
                td_txt = td.find('img').attr('src')
                data[col_headers.index('Logo')] = td_txt
            if th_txt in col_headers:
                td_txt = td.text()
                data[col_headers.index(th_txt)] = td_txt

            #從Wiki的Type或Trade as欄位取的股票的Symbol

            if th_txt == 'Type':
                try:
                    symbol = re_obj.search(td_txt).group("symbol")
                except Exception:
                    pass
            if (not symbol) and th_txt == 'Traded as':
                try:
                    symbol = re_obj.search(td_txt).group("symbol")
                except Exception:
                    pass
            data[len(col_headers) + extra_col_headers.index("Symbol")] = symbol

        csv_writer.writerow([url] + data)