def __init__(self, file_paths):
    # DB model
    self.db_model = DocumentsExporterDbModel()
    # Path to output directory.
    self.file_paths = file_paths
    # Object for stock movements.
    self.stock_processor = StockPriceProcessor()
    # Define output document classes.
    self.doc_classes = {'up': '1', 'down': '2'}
    # Object for writing text files.
    self.text_writer = TextWriter(file_paths['output_dir'])
Example #2
    # Save all data
    item_data = {}
    for item_row in rows:
        item_name = item_row.find_all('td')[1].text.strip()[0:-1]
        item_value = item_row.find_all('td')[2].text
        item_data[item_name] = ' '.join(item_value.strip().split())

    # Check values
    datum_zapisu = item_data.get(u'Datum zápisu', 'false')
    uzemi = item_data.get(u'Území/Zeměpisná oblast', 'false')

    # Columns: číslo přihlášky (application number), znění (wording), datum zápisu
    # (registration date), území (territory), zboží (goods), stav (status).
    item_list = [
        item_data[u'Číslo přihlášky'],
        item_data[u'Znění'],
        datum_zapisu,
        uzemi,
        item_data[u'Zboží/Výrobky'],
        item_data[u'Stav'],
    ]
    products.append(item_list)


# Prepare header
header = [u'číslo přihlášky', u'znění', u'datum zápisu', u'území', u'zboží', u'stav']
products.insert(0, header)
# Write data to file
tw = TextWriter()
tw.write_file('products', products)
Example #3
    # Get Wiki page
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')
    # Find infobox
    table = soup.find('table', class_='infobox')
    # Check if the table was found - if not, it's a past currency.
    if not table:
        continue
    # Find ISO code
    iso_cell = table.find('a', title='ISO 4217')
    # Check if the code was found - if not, it's not a regular currency.
    if not iso_cell:
        continue
    cur_code = iso_cell.parent.parent.contents[3].text.strip()
    cur_name = soup.select('#firstHeading')[0].text.strip()
    # Save the code
    c_symbols[symbol] = [cur_code, cur_name]

# Create list from c_symbols
c_list = []
for symbol, (code, name) in c_symbols.items():
    c_list.append([code, symbol, name])

# Prepare header
header = ['currency code', 'currency symbol', 'currency name']
c_list.insert(0, header)

# Write list to file
tw = TextWriter()
tw.write_file(c_list, 'currency_symbols_raw', ';')
Example #4

def parse_d_level(text):
    # Split the comma-separated list of degree levels and normalise each entry.
    d_levels = [x.strip().encode("utf-8") for x in text.split(",")]
    r_bc = "bakalářský" in d_levels
    r_ing = "magisterský navazující" in d_levels
    r_phd = "doktorský" in d_levels
    return r_bc, r_ing, r_phd


# Read file and get studies
f_obj = codecs.open("studies_in.txt", "r", encoding="utf-8")
for line in f_obj:
    # Parse line
    result = reg_exp.match(line)
    # Skip lines that do not match the expected format.
    if not result:
        continue
    # Parse degree level
    (l_bc, l_ing, l_phd) = parse_d_level(result.group(4))
    # Save data
    row = [result.group(1), result.group(2), result.group(3), str(l_bc), str(l_ing), str(l_phd), result.group(5)]
    studies.append(row)

# Prepare header
header = ["study language", "field of study", "university", "bc", "ing", "phd", "capacity"]
studies.insert(0, header)

# Get path to the script directory.
output_dir = os.path.dirname(os.path.realpath(__file__))
# Create a CSV file from the list.
tw = TextWriter(output_dir)
tw.write_studies_file("studies_out", studies)
class DocumentsExporter(object):
    """
    Export documents for conversion to vectors and classification by machine learning algorithms.
    """

    def __init__(self, file_paths):
        # DB model
        self.db_model = DocumentsExporterDbModel()
        # Path to output directory.
        self.file_paths = file_paths
        # Object for stock movements.
        self.stock_processor = StockPriceProcessor()
        # Define output document classes.
        self.doc_classes = {'up': '1', 'down': '2'}
        # Object for writing text files.
        self.text_writer = TextWriter(file_paths['output_dir'])

    # PUBLIC METHODS

    def process_documents_for_all_companies(self, doc_type, from_date, days_delay, price_type, const_boundaries,
                                            balance_classes_for_company, docs_per_file=50000, only_one_file=True):
        print('===Processing %s===' % doc_type)
        # Reset document counts.
        documents_count = 0
        files_count = 0
        # Create file name.
        f_number = '' if only_one_file else '_0'
        file_name = doc_type.replace('_', '-') + '_all_%s_%s_%s%s' % \
                                                 (price_type, str(days_delay), const_boundaries[1], f_number)
        # Process all companies.
        for comp in self.db_model.get_companies():
            #print('===Company %d===') % comp[0]
            # Process and write data for one company.
            new_docs_count = self.process_documents_for_company(doc_type, comp[0], from_date, days_delay,
                                                                price_type, const_boundaries,
                                                                balance_classes_for_company, file_name)
            documents_count += new_docs_count
            # Check if the file should be ended.
            if documents_count > docs_per_file:
                print('>>>NEW FILE')
                if only_one_file:
                    break
                else:
                    files_count += 1
                    documents_count = 0
                    file_name = re.sub(r'\d+$', str(files_count), file_name)
        # The end.
        print('>>>All %s for all companies exported. Total docs: %d' %
              (doc_type, files_count * docs_per_file + documents_count))


    def process_companies_by_source(self, file_desc, doc_type, from_date, to_date, days_delay, price_type,
                                    const_boundaries, balance_classes_for_company, docs_per_file=100000):
        # Get company IDs.
        company_ids = self.db_model.get_companies_by_doc_type(doc_type)
        c_ids_list = [x[0] for x in company_ids]
        # Process all selected companies.
        self.process_documents_for_selected_companies(
            c_ids_list, doc_type, from_date, to_date, days_delay, price_type,
            const_boundaries, balance_classes_for_company, docs_per_file, file_desc)


    def process_documents_for_selected_companies(self, companies_ids, doc_type, from_date, to_date, days_delay,
                                                 price_type, const_boundaries, balance_classes_for_company,
                                                 docs_per_file=100000, companies_filename=False, nonsearch_tweets_cids=False):
        print('===Processing %s===' % doc_type)
        # Reset document counts.
        documents_count = 0
        # Calculate number of documents per company.
        docs_per_company = int(round(docs_per_file / float(len(companies_ids))))
        if doc_type == 'fb_comment':
            docs_per_company = 10000
        elif doc_type == 'tweet':
            docs_per_company = 100000
        # Choose file description string.
        if companies_filename:
            fs_comp = companies_filename
        else:
            if len(companies_ids) < 5:
                fs_comp = '-'.join(str(v) for v in companies_ids)
            else:
                fs_comp = 'm%s' % len(companies_ids)
        # Create file name.
        file_name = '%s_%s_%s_%s_%s' % \
                    (doc_type.replace('_', '-'), fs_comp, price_type, str(days_delay), const_boundaries[1])

        # Process all companies.
        for comp in self.db_model.get_selected_companies(companies_ids):
            #print('===Company %d===') % comp[0]
            # Process and write data for one company.
            new_docs_count = self.process_daily_documents_for_company(
                doc_type, comp[0], from_date, to_date, days_delay, price_type, const_boundaries,
                balance_classes_for_company, docs_per_company, file_name, nonsearch_tweets_cids)
            #print new_docs_count
            # Increment docs count.
            documents_count += new_docs_count
            # Check if the file should be ended.
            if documents_count > docs_per_file:
                print('>>>END FILE')
        # The end.
        print('>>>All %s for selected companies exported. Total docs: %d' % (doc_type, documents_count))


    def process_daily_documents_for_company(self, doc_type, company_id, from_date, to_date, days_delay, price_type,
                                            const_boundaries, balance_classes, max_docs_per_company,
                                            total_file_name=False, nonsearch_tweets_cids=False):
        # Set stock prices for given company.
        prices = self.stock_processor.set_stock_prices(company_id, from_date, price_type)
        if not prices:
            return False
        # Calculate number of documents per day: n = docs_per_company / days(to_date - from_date)
        date_delta = to_date - from_date
        docs_per_day = int(round(max_docs_per_company / float(date_delta.days)))
        docs_per_day *= 2   # To get more documents, increase the count.
        if doc_type == 'tweet':
            docs_per_day = 200  # For Twitter
        if doc_type == 'fb_comment':
            docs_per_day = 40   # For Facebook
        # Example: 25 000 docs per company / 241 days = 104 docs per day
        #print ('>>Docs per company/days/per day: %d, %d, %d') % (max_docs_per_company, date_delta.days, docs_per_day)
        # Define variables.
        docs_query_limit = 400  # Do not change -- cached queries won't work. Original value: 400.
        total_doc_list = []
        docs_counter = 0
        processed_date = from_date
        day_plus = datetime.timedelta(days=1)
        # Use non-search tweets only for companies in the given list.
        if doc_type == 'tweet' and nonsearch_tweets_cids and company_id in nonsearch_tweets_cids:
            doc_type = 'tweet_nonsearch'
        # For every day, get documents from DB.
        while processed_date <= to_date:
            #print processed_date
            # Get documents for current date from DB.
            if doc_type == 'fb_post':
                daily_documents = self.db_model.get_daily_fb_posts_for_company(company_id, processed_date, docs_query_limit)
            elif doc_type == 'fb_comment':
                daily_documents = self.db_model.get_daily_fb_comments_for_company(company_id, processed_date, docs_query_limit)
            elif doc_type == 'article':
                daily_documents = self.db_model.get_daily_articles_for_company(company_id, processed_date, docs_query_limit)
            elif doc_type == 'tweet':
                daily_documents = self.db_model.get_daily_tweets_for_company(company_id, processed_date, docs_query_limit)
            elif doc_type == 'tweet_nonsearch':
                daily_documents = self.db_model.get_daily_nonsearch_tweets_for_company(company_id, processed_date, docs_query_limit)
                doc_type = 'tweet'  # reset tweet doctype
            else:
                raise ValueError('Unknown document type.')
            # Process the documents.
            d_list = self._process_given_documents(daily_documents, doc_type, days_delay, price_type, const_boundaries, False)
            #print('Processed docs: %d') % d_length
            # Increment day.
            processed_date += day_plus
            # If there are no documents, continue with next date.
            if not d_list:
                continue
            # Cap the number of documents at the daily limit.
            if len(d_list) > docs_per_day:
                d_list = d_list[0:docs_per_day]
            #print('Saved docs: %d') % len(d_list)
            # Add documents to total list.
            total_doc_list.extend(d_list)
            docs_counter += len(d_list)
            # Check number of already saved documents.
            if docs_counter > max_docs_per_company:
                print('Max documents count (%d) for company reached.' % max_docs_per_company)
                break   # Stop and write documents to file.

        # Check if there are any documents.
        if not total_doc_list:
            return False
        # Write documents from all dates.
        self._write_docs_to_file(total_doc_list, doc_type, company_id, days_delay, price_type, const_boundaries, total_file_name)
        # Return some information.
        return len(total_doc_list)


    def process_documents_for_company(self, doc_type, company_id, from_date, days_delay, price_type,
                                      const_boundaries, balance_classes, total_file_name=False):
        # Set stock prices for given company.
        prices = self.stock_processor.set_stock_prices(company_id, from_date, price_type)
        if not prices:
            return False
        # Get documents from DB.
        if doc_type == 'fb_post':
            documents = self.db_model.get_fb_posts_for_company(company_id, from_date)
        elif doc_type == 'fb_comment':
            documents = self.db_model.get_fb_comments_for_company(company_id, from_date)
        elif doc_type == 'article':
            documents = self.db_model.get_articles_for_company(company_id, from_date)
        elif doc_type == 'tweet':
            documents = self.db_model.get_tweets_for_company(company_id, from_date)
        else:
            raise ValueError('Unknown document type.')
        # Process the documents.
        d_list = self._process_given_documents(documents, doc_type, days_delay, price_type, const_boundaries, balance_classes)
        # Check if there were any documents.
        if not d_list:
            return False
        # Write documents to correct file.
        self._write_docs_to_file(d_list, doc_type, company_id, days_delay, price_type, const_boundaries, total_file_name)
        # Return some information.
        return len(d_list)


    def change_output_dir(self, new_dir):
        # Update object attributes.
        self.file_paths['output_dir'] = new_dir
        self.text_writer.output_dir = new_dir
        # Check if directory exists. If it doesn't, create it.
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)


    # PRIVATE METHODS

    def _process_given_documents(self, documents, doc_type, days_delay, price_type, const_boundaries, balance_classes):
        # Prepare counts for class balance.
        count_class_up = 0
        count_class_down = 0
        # Process documents - create a list for writing to a file.
        new_docs_list = []
        for doc in documents:
            # Get document publication date
            if doc_type == 'fb_post' or doc_type == 'fb_comment':
                doc_date = datetime.datetime.utcfromtimestamp(doc['created_timestamp']).date()
            elif doc_type == 'article':
                doc_date = doc['published_date'].date()
            elif doc_type == 'tweet':
                doc_date = doc['created_at'].date()
            else:
                raise ValueError('Unknown document type.')
            # Get stock price movement direction.
            movement_direction = self.stock_processor.get_price_movement_with_delay(doc_date, days_delay, const_boundaries)
            # If the company was not on the stock exchange on this date, skip the post.
            if not movement_direction:
                continue
            # Skip documents with constant direction.
            if movement_direction == 'const':
                continue
            # Edit document text.
            if doc_type == 'fb_post' or doc_type == 'fb_comment' or doc_type == 'tweet':
                doc_text = self._process_facebook_text(doc['text'])
            elif doc_type == 'article':
                doc_text = self._process_article_text(doc['text'])
            # Check if the document is not empty (or too short).
            if len(doc_text) < 2:
                continue
            # Add created data to the list.
            new_docs_list.append([self.doc_classes[movement_direction], doc_text])
            # Increment variables.
            if movement_direction == 'up':
                count_class_up += 1
            elif movement_direction == 'down':
                count_class_down += 1
        # If set, balance document classes.
        if balance_classes:
            min_class_count = min([count_class_up, count_class_down])
            return self._balance_documents(new_docs_list, min_class_count)
        else:
            return new_docs_list


    def _balance_documents(self, docs_list, min_class_count):
        """
        Create a new documents list, where each class will have the same number of documents.

        :param docs_list: list: (class, text).
        :param min_class_count: int: Provided minimal class count.
        :return: list
        """
        # Class counts variables.
        c_1 = 0
        c_2 = 0
        new_docs_list = []
        # Loop through all documents.
        for i, doc in enumerate(docs_list):
            if doc[0] == '1':
                c_1 += 1
                if c_1 <= min_class_count:
                    new_docs_list.append(doc)
            if doc[0] == '2':
                c_2 += 1
                if c_2 <= min_class_count:
                    new_docs_list.append(doc)
        # Return the balanced list.
        return new_docs_list


    def _write_docs_to_file(self, docs_list, doc_type, company_id, days_delay, price_type,
                            const_boundaries, total_file_name=False):
        # Choose the correct file name (bulk vs individual generating).
        if total_file_name:
            file_name = total_file_name
            file_mode = 'a'
        else:
            file_name = doc_type.replace('_', '-') + '_%s_%s_%s_%s' % (company_id, price_type, days_delay, const_boundaries[1])
            file_mode = 'w'
        # Write data to the file.
        self.text_writer.write_file_for_vectorization(file_name, docs_list, file_mode)
        return file_name


    # TEXT processing

    def _process_facebook_text(self, text):
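        # Example with a hypothetical input: '#AAPL up :) http://t.co/x'
        # comes out as 'aapl up xyzposemoticon xurl' after the steps below.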
        # Remove hash tag symbols.
        text = text.replace('#', '')
        # Remove at symbols.
        text = text.replace('@', '')
        # Replace URL links.
        text = re.sub(r'https?://\S+', 'XURL', text)
        # Replace emoticons with descriptions.
        text = re.sub(r':\)|:-\)|:D|=\)', ' XyzPosEmoticon ', text)
        text = re.sub(r':\(|:-\(', ' XyzNegEmoticon ', text)
        # Remove whitespace.
        text = ' '.join(text.strip().split())
        # Lowercase the text.
        text = text.lower()
        # Result
        return text

    def _process_article_text(self, text):
        # Remove URL links.
        #text = re.sub(r'(https?://\S+)|(www\.\w+\.\S+)', 'URL', text)
        text = re.sub(r'https?://\S+', 'XURL', text)
        # Remove paragraph tags.
        text = re.sub(r'<p>|</p>', '', text)
        # Lowercase the text.
        text = text.lower()
        # Result
        return text
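
# A minimal usage sketch, assuming the file_paths dict only needs an
# 'output_dir' key (as read in __init__) and that the datetime module is
# already imported at module level (the class itself uses it). The dates,
# price type and boundaries below are illustrative placeholders, not values
# taken from the original project.
if __name__ == '__main__':
    exporter = DocumentsExporter({'output_dir': '/tmp/exported_docs'})
    exporter.process_documents_for_all_companies(
        doc_type='fb_post',
        from_date=datetime.date(2014, 1, 1),
        days_delay=1,
        price_type='close',            # assumed price type label
        const_boundaries=(0.0, 0.5),   # assumed boundaries; only index 1 appears in file names
        balance_classes_for_company=True)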
Example #6
        current_date.strftime('%A'),
        current_date.day,
        current_date.month,
        current_date.year,
        week_n,
    ]
    # Is it a weekend or workday?
    if week_day_n in [6, 7]:
        day_week_type = 'weekend'
    else:
        day_week_type = 'workday'
    # Save data
    day_data.append(day_week_type)
    total_data.append(day_data)
    # Increment current date by one day.
    current_date += datetime.timedelta(days=1)
    pk += 1

# Prepare header
header = [
    'DateKey', 'FullDateKey', 'DayNumberInWeek', 'DayName',
    'DayNumber', 'MonthNumber', 'YearNumber', 'WeekNumber', 'DayWeekType'
]
total_data.insert(0, header)

# Get path to the script directory.
output_dir = os.path.dirname(os.path.realpath(__file__))
# Create a CSV file from the list.
tw = TextWriter(output_dir)
tw.write_date_file('date_dim', total_data)