Example #1
def write_to_file(file_name, companies, summary_columns, wb_sheet):
    """
    This method generates a new Excel file with the name <file_name> that will
    contains the data found in PreSeries for each entry in the original excel
    file


    :param file_name: the name of the file to be generated
    :param companies: a list of companies with the basic data found in PreSeries
    :param summary_columns: the columns in the original file that will be
        used in the new file to give more information about the companies.
    :param wb_sheet: the excel sheet of the original file where we will find
        the summary fields of the companies.
    """
    workbook = Workbook()
    companies_sheet = workbook.add_sheet('Companies')

    # Build the header names. The column order must match the writes below:
    # row, name, country, domain.
    header = ["Original Row", "Company Name", "Country", "Domain"]
    if summary_columns:
        header.extend(summary_columns)
    for index, value in enumerate(header):
        companies_sheet.write(0, index, value)

    for index, company_data in enumerate(companies):
        companies_sheet.write(1 + index, 0, company_data["row"])
        name = company_data.get("name", "")
        companies_sheet.write(
            1 + index, 1, name.decode('utf-8', 'ignore') if name else "")
        companies_sheet.write(
            1 + index, 2, company_data.get("country_code", ""))
        companies_sheet.write(
            1 + index, 3, company_data.get("domain", ""))
        for index2, summary_column in enumerate(summary_columns):
            cell = wb_sheet.cell_value(
                company_data["row"],
                PreSeriesUtils.excel2num(summary_column))
            try:
                columnvalue = cell.encode('cp1252')
            except UnicodeEncodeError:
                # Fall back to UTF-8 for values outside the cp1252 range
                columnvalue = cell.encode('utf-8')

            companies_sheet.write(1 + index, 4 + index2,
                                  columnvalue.decode('utf-8', 'ignore'))
    workbook.save(file_name)
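
# A minimal usage sketch (file names, column letters and the sample company
# are illustrative, not part of the original script):
from xlrd import open_workbook

companies = [
    {"row": 1, "name": "PreSeries", "country_code": "ESP",
     "domain": "preseries.com"},
]
original_sheet = open_workbook("companies.xls").sheets()[0]
write_to_file("enriched.xls", companies,
              summary_columns=["E", "F"], wb_sheet=original_sheet)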
Example #2
def dump_similar_objects(similar_by_company):
    """ This methods generates s CSV-like version of the similar companies
    of each company
    """

    # These are the basic fields of the similar companies that we want to export
    headers = [
        'Company ID'.encode('utf8'),
        'Company Name'.encode('utf8'),
        'Similar Company Id'.encode('utf8'),
        'Similar Company Name'.encode('utf8'),
        'Similar Company Score'.encode('utf8'),
        'Distance Btw Companies'.encode('utf8'),
        'Max Distance in Cluster'.encode('utf8'),
        'Similarity'.encode('utf8')]

    fields = [
        "company_id",
        "company_name",
        "similar_company_id",
        "similar_company_name",
        "similar_company_score",
        "distance",
        "max_distance",
        "similarity"
    ]

    resources = []
    for similar in similar_by_company.values():
        resources.extend(similar)

    return PreSeriesUtils.dump_opbjects(headers, fields, resources)
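
# PreSeriesUtils.dump_opbjects (sic) is not shown in these examples. A
# minimal sketch of what such a helper plausibly does, assuming each
# resource is a dict keyed by the field names (hypothetical stand-in, not
# the actual PreSeriesUtils implementation):
def dump_objects_sketch(headers, fields, resources):
    """Builds a CSV-like list of rows: one header row plus one row per
    resource, with '' substituted for missing fields."""
    rows = [headers]
    for resource in resources:
        rows.append([resource.get(field, "") for field in fields])
    return rows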
Example #3
def dump_rounds_objects(founders):
    """ This methods generates s CSV-like version of the Company objects, a
    list of rows with columns
    """

    # These are the basic fields of the funding rounds that we want to export
    headers = [
        'Company ID'.encode('utf8'),
        'Company Name'.encode('utf8'),
        'Stage Name'.encode('utf8'),
        'Date'.encode('utf8'),
        'Funding Type'.encode('utf8'),
        'Series'.encode('utf8'),
        'Amount'.encode('utf8')]

    fields = [
        "company_id",
        "company_name",
        "stage",
        "date",
        "funding_type",
        "series",
        "amount"
    ]

    return PreSeriesUtils.dump_opbjects(headers, fields, founders)
Example #4
def dump_stages_objects(founders):
    """ This methods generates s CSV-like version of the Company stages, a
    list of rows with columns
    """

    # These are the basic fields of the stages that we want to export
    headers = [
        'Company ID'.encode('utf8'),
        'Company Name'.encode('utf8'),
        'Stage Name'.encode('utf8'),
        'Start Date'.encode('utf8'),
        'End Date'.encode('utf8'),
        'First Round Date'.encode('utf8'),
        'Last Round Date'.encode('utf8'),
        'Total Investment'.encode('utf8'),
        'Total Rounds'.encode('utf8')]

    fields = [
        "company_id",
        "company_name",
        "stage",
        "start_date",
        "end_date",
        "first_round_date",
        "last_round_date",
        "investment_amount",
        "total_rounds"
    ]

    return PreSeriesUtils.dump_opbjects(headers, fields, founders)
Example #5
def dump_person_objects(founders):
    """ This methods generates s CSV-like version of the Company persons, a
    list of rows with columns
    """

    # These are the basic fields of the persons that we want to export
    headers = [
        'Company ID'.encode('utf8'),
        'Company Name'.encode('utf8'),
        'PreSeries ID'.encode('utf8'),
        'Firstname'.encode('utf8'),
        'Lastname'.encode('utf8'),
        'Crunchbase URL'.encode('utf8'),
        'Crunchbase Id'.encode('utf8'),
        'LinkedIn URL'.encode('utf8'),
        'Facebook URL'.encode('utf8'),
        'Twitter URL'.encode('utf8'),
        'Google+ URL'.encode('utf8'),
        'Gender'.encode('utf8'),
        'Birthdate'.encode('utf8'),
        'Updated on'.encode('utf8')]

    fields = [
        "company_id",
        "company_name",
        "person_id",
        "first_name",
        "last_name",
        "crunchbase_url",
        "crunchbase_uuid",
        "linkedin_url",
        "facebook_url",
        "twitter_url",
        "google_plus_url",
        "gender",
        "born",
        "updated",
    ]

    return PreSeriesUtils.dump_opbjects(headers, fields, founders)
Example #6
    def read_search_data_from_excel(self,
                                    file_name,
                                    column_id=None,
                                    column_name=None,
                                    column_country=None,
                                    column_domain=None,
                                    skip_rows=0):
        """
        This method is responsible for extract from an Excel file all the
        companies we will need to find in PreSeries.

        for build the query parameters that
        we are going to use to look for the companies in PreSeries informed
        in an Excel file.

        The query string will have only the id criteria or the name of the
         company if the id is not informed. The domain and country_code won't
         be used in the query, we will use them later for select the best
         match from all the candidates that matched the query.

        :return: a list where each row is one company which contains a tuple
            with two items, the query string to look in preseries for the
            company and the map with all the parameters used in the query
        """

        logging.debug("Looking for the first sheet in the Excel.")
        wb = open_workbook(file_name)
        first_sheet = wb.sheets()[0]
        logging.debug("Sheet name [%s]." % first_sheet.name)

        self.companies_query = []
        for row in range(skip_rows, first_sheet.nrows):

            logging.debug("Processing row: %d" % row)

            if column_id:
                company_id = first_sheet.cell_value(
                    row, PreSeriesUtils.excel2num(column_id))
                self.companies_query.append(("id=%s" % company_id, {
                    "row": row,
                    "id": company_id
                }))
                continue

            query_string = {}
            query_params = {"row": row}

            if column_name and \
                    first_sheet.cell_value(
                        row, PreSeriesUtils.excel2num(column_name)):

                cell = first_sheet.cell_value(
                    row, PreSeriesUtils.excel2num(column_name))
                try:
                    company_name = cell.encode('cp1252')
                except UnicodeEncodeError:
                    # Fall back to UTF-8 for names outside the cp1252 range
                    company_name = cell.encode('utf-8')

                query_string['name__icontains'] = company_name
                query_params["name"] = company_name

            if column_domain:
                company_domain = PreSeriesUtils.resolve_domain(
                    first_sheet.cell_value(
                        row, PreSeriesUtils.excel2num(column_domain)))

                if company_domain:
                    # We only use the domain after the search to select the
                    # best candidate
                    query_params["domain"] = company_domain

            if column_country and \
                    first_sheet.cell_value(
                        row, PreSeriesUtils.excel2num(column_country)):
                country_code = PreSeriesUtils.resolve_country(
                    first_sheet.cell_value(
                        row, PreSeriesUtils.excel2num(column_country)))

                if country_code:
                    # We only use the country_code after the search to
                    # select the best candidate
                    query_params['country_code'] = country_code

            self.companies_query.append(
                (urllib.urlencode(query_string), query_params))
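
        # For illustration (hypothetical row contents, not from the source):
        # a row whose name column holds "PreSeries" and whose domain column
        # resolves to "preseries.com" would append a tuple like:
        #   ("name__icontains=PreSeries",
        #    {"row": 1, "name": "PreSeries", "domain": "preseries.com"})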
Example #7
    def search_companies(self):
        """
        We are going to get all the Companies from PreSeries using the search
        url calculated for each Company.

        We use the internal field "companies_query" to prepare the search.
        This property has a list of tuples, where each tuple contains the
        following information:
            - the "query string" to do the REST query
            - the "company_details" as a map with all the field-values of
                the company we want to look for in PreSeries.

        Ex.

            query_string = name__icontains=prese
            company_details = {
                "name": "PreSeries",
                "country_code": "ESP",
                "domain": "preseries.com"
            }

        The query string can, and should, avoid using all the company
        properties, so the query stays flexible. For instance, we can build
        query strings using only the "name" property to get as many
        companies as possible from PreSeries, and then use all the other
        properties (country_code, domain, etc.) to decide which company is
        most likely the one we are looking for.

        :return: the companies found and the ones that were not found
        """
        found_companies = []
        unknown_companies = []

        for query_string, company_details in self.companies_query:
            # We download a maximum of 100 companies from the total that
            # matches the search criteria (limit=100)
            query = "limit=100&%s" % query_string
            logging.debug("Query: %s" % query)

            resp = self.api.search_companies(query_string=query)

            # We get multiple companies as a response.
            if resp['meta']['total_count'] > 1:
                best_candidate = PreSeriesUtils.select_best_company(
                    company_details, resp['objects'])

                logging.warning("More than one match!\n"
                                "Params: %s \n"
                                "Selected candidate: %s" %
                                (company_details, best_candidate))

                company_data = {"row": company_details["row"]}
                company_data.update(best_candidate)

                found_companies.append(
                    PreSeriesUtils.encoding_conversion(company_data))

            elif resp['meta']['total_count'] == 0:
                logging.warning("Unknown company: %s" % company_details)
                unknown_companies.append(company_details)

            else:
                company_data = {"row": company_details["row"]}
                company_data.update(resp["objects"][0])
                found_companies.append(
                    PreSeriesUtils.encoding_conversion(company_data))

        return found_companies, unknown_companies
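
# Putting Examples 1, 6 and 7 together: a hedged end-to-end sketch. The
# class that owns read_search_data_from_excel() and search_companies() is
# not shown in these examples, so CompanyExporter and its construction are
# assumptions; file names and column letters are illustrative.
from xlrd import open_workbook

exporter = CompanyExporter()  # ASSUMPTION: hypothetical owning class
exporter.read_search_data_from_excel(
    "companies.xls", column_name="B",
    column_country="C", column_domain="D", skip_rows=1)
found, unknown = exporter.search_companies()

wb_sheet = open_workbook("companies.xls").sheets()[0]
write_to_file("enriched.xls", found,
              summary_columns=["E"], wb_sheet=wb_sheet)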
Example #8
def dump_company_objects(companies_details):
    """ This methods generates s CSV-like version of the Company objects, a
    list of rows with columns
    """

    # These are the basic fields of the companies that we want to export
    headers = [
        'PreSeries ID'.encode('utf8'),
        'Name'.encode('utf8'),
        'Elevator Pitch'.encode('utf8'),
        'Foundation date'.encode('utf8'),
        'Domain'.encode('utf8'),
        'Status'.encode('utf8'),
        'Country'.encode('utf8'),
        'City'.encode('utf8'),
        'Stage'.encode('utf8'),
        'Areas'.encode('utf8'),
        'Top Area'.encode('utf8'),
        'Headcount'.encode('utf8'),
        'Num of Founders'.encode('utf8'),
        'Locations'.encode('utf8'),
        'Diversification'.encode('utf8'),
        'Funding rounds'.encode('utf8'),
        'Total Funding'.encode('utf8'),
        'First funding on'.encode('utf8'),
        'Days to first funding'.encode('utf8'),
        'Last funding on'.encode('utf8'),
        'Days since last funding'.encode('utf8'),
        'Num of MBAs'.encode('utf8'),
        'Num of PhDs'.encode('utf8'),
        'Num of patents first year'.encode('utf8'),
        'Num of patents last year'.encode('utf8'),
        'Twitter bio'.encode('utf8'),
        'Twitter followers'.encode('utf8'),
        'Twitter following'.encode('utf8'),
        'Twitter tweets'.encode('utf8'),
        'Twitter url'.encode('utf8'),
        'Crunchbase url'.encode('utf8'),
        'LinkedIn url'.encode('utf8'),
        'Facebook url'.encode('utf8'),
        'Google Plus url'.encode('utf8'),
        'IPO %'.encode('utf8'),
        'Acquired %'.encode('utf8'),
        'Defunct %'.encode('utf8'),
        'Ratio - Influencer'.encode('utf8'),
        'Ratio - Traction'.encode('utf8'),
        'Country Rank'.encode('utf8'),
        'Country Rank Change'.encode('utf8'),
        'Country Rank Percentile'.encode('utf8'),
        'Country Rank Percentile Change'.encode('utf8'),
        'Area Rank'.encode('utf8'),
        'Area Rank Change'.encode('utf8'),
        'Area Rank Percentile'.encode('utf8'),
        'Area Rank Percentile Change'.encode('utf8'),
        'World Rank'.encode('utf8'),
        'World Rank Change'.encode('utf8'),
        'World Rank Percentile'.encode('utf8'),
        'World Rank Percentile Change'.encode('utf8'),
        'Score'.encode('utf8'),
        'Score Change'.encode('utf8'),
        'Tracked from'.encode('utf8'),
        'Updated on'.encode('utf8')]

    fields = [
        "company_id",
        "name",
        "company/elevator_pitch",
        "foundation_date",
        "domain",
        "status",
        "country_code",
        "city",
        "stage",
        "areas",
        "top_area",
        "headcount",
        "num_of_cofounders",
        "locations_list",
        "diversity_list",
        "funding_count",
        "funding_sum",
        "first_funding_on",
        "days_to_first_funding",
        "last_funding_on",
        "days_since_last_funding",
        "num_of_mbas",
        "num_of_phds",
        "num_patents_1st_year",
        "num_patents_on_exit_0",
        "twitter_bio",
        "twitter_followers",
        "twitter_following",
        "twitter_tweets",
        "twitter_url",
        "company/crunchbase_url",
        "company/linkedin_url",
        "company/facebook_url",
        "company/googleplus_url",
        "transition_ipo",
        "transition_acquired",
        "transition_defunct",
        "ratio_influencer",
        "ratio_traction",
        "country_rank",
        "country_rank_change",
        "country_rank_percentile",
        "country_rank_percentile_change",
        "area_rank",
        "area_rank_change",
        "area_rank_percentile",
        "area_rank_percentile_change",
        "world_rank",
        "world_rank_change",
        "world_rank_percentile",
        "world_rank_percentile_change",
        "score",
        "score_change",
        "tracked_from",
        "updated_on",
    ]

    return PreSeriesUtils.dump_opbjects(headers, fields, companies_details)
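
# Several of the fields above use slash-separated paths (e.g.
# "company/elevator_pitch"), which suggests dump_opbjects resolves nested
# sub-objects. A hedged sketch of such a lookup (hypothetical helper, not
# part of the actual PreSeriesUtils):
def resolve_field(resource, path):
    """Walks 'a/b' through nested dicts, returning '' when absent."""
    value = resource
    for key in path.split('/'):
        if not isinstance(value, dict) or key not in value:
            return ""
        value = value[key]
    return value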