Example #1
import numpy as np
from pandas import DataFrame, Series


class Ops2:

    def setup(self):
        N = 10**3
        self.df = DataFrame(np.random.randn(N, N))
        self.df2 = DataFrame(np.random.randn(N, N))

        self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
                                                  np.iinfo(np.int16).max,
                                                  size=(N, N)))
        self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
                                                   np.iinfo(np.int16).max,
                                                   size=(N, N)))

        self.s = Series(np.random.randn(N))

    # Division

    def time_frame_float_div(self):
        self.df // self.df2

    def time_frame_float_div_by_zero(self):
        self.df / 0

    def time_frame_float_floor_by_zero(self):
        self.df // 0

    def time_frame_int_div_by_zero(self):
        self.df_int / 0

    # Modulo

    def time_frame_int_mod(self):
        self.df_int % self.df2_int

    def time_frame_float_mod(self):
        self.df % self.df2

    # Dot product

    def time_frame_dot(self):
        self.df.dot(self.df2)

    def time_series_dot(self):
        self.s.dot(self.s)

    def time_frame_series_dot(self):
        self.df.dot(self.s)
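For reference, the same dot-product operations can be timed by hand outside the asv benchmark suite; a minimal sketch, assuming only NumPy and pandas are installed:

import timeit

import numpy as np
from pandas import DataFrame, Series

N = 10**3
df, df2 = DataFrame(np.random.randn(N, N)), DataFrame(np.random.randn(N, N))
s = Series(np.random.randn(N))

print(timeit.timeit(lambda: df.dot(df2), number=3))    # frame @ frame
print(timeit.timeit(lambda: df.dot(s), number=10))     # frame @ series
print(timeit.timeit(lambda: s.dot(s), number=100))     # series @ series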
Example #2
def numpy_dot():
    '''
    Imagine a point system in which each country is awarded 4 points for each
    gold medal, 2 points for each silver medal, and 1 point for each
    bronze medal.

    Using the numpy.dot function, create a new dataframe called 
    'olympic_points_df' that includes:
        a) a column called 'country_name' with the country name
        b) a column called 'points' with the total number of points the country
           earned at the Sochi Olympics.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
 
    # YOUR CODE HERE (requires: from pandas import DataFrame, Series)
    olympic_medal_counts_df = {'gold': Series(gold),
                               'silver': Series(silver),
                               'bronze': Series(bronze)}
    vector = [4, 2, 1]
    df = DataFrame(olympic_medal_counts_df)
    df = df[['gold', 'silver', 'bronze']]
    print(df)

    points = df.dot(vector)
    olympic_points_df = DataFrame({'country_name': Series(countries),
                                   'points': Series(points)})
    return olympic_points_df
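A short usage sketch for the function above (my own example; it assumes numpy_dot is defined in a module where DataFrame and Series are imported from pandas):

olympic_points_df = numpy_dot()
print(olympic_points_df.head())
# Expected first row (Russian Fed.): 13*4 + 11*2 + 9*1 = 83 points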
Example #3
def metabolite_distance(model, model_biomass=None, drop_metabolites=None):
    # Calculate metabolite-reaction distance matrix
    V = model.get_stoichiometric_matrix()

    # Remove exchange reactions (and the biomass reaction, if given)
    exchanges = model.get_exchanges(check_matrix=True)
    if model_biomass:
        V = V.drop(np.append(exchanges, model_biomass), axis=1)
    else:
        V = V.drop(exchanges, axis=1)

    # Set every non-zero stoichiometric coefficient to 1
    V = DataFrame([[i if i == 0 else 1 for i in j] for j in V.values], index=V.index, columns=V.columns)

    # Remove highly connected metabolites, if any were specified
    if drop_metabolites is not None:
        V = V.drop(drop_metabolites)

    # Multiply stoichiometric matrix by its transpose
    M = V.dot(V.T).abs()

    # Get shortest path lengths for all metabolites
    # (all_pairs_dijkstra_path_length returns an iterator in networkx >= 2.0, so wrap it in dict)
    G = DataFrame(dict(nx.all_pairs_dijkstra_path_length(
        nx.from_numpy_matrix(M.values, create_using=nx.DiGraph()))))
    G = G.set_index(M.index)
    G.columns = M.index

    return G
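A toy illustration, not taken from the project, of the V.dot(V.T) step: for a binarized stoichiometric matrix S, the non-zero entries of S·Sᵀ mark metabolite pairs that share at least one reaction, which is the adjacency matrix the shortest-path search then runs on.

from pandas import DataFrame

# Hypothetical 3-metabolite, 2-reaction network: A -> B (r1), B -> C (r2),
# with all stoichiometric coefficients already set to 1 as in the code above
S = DataFrame([[1, 0],
               [1, 1],
               [0, 1]],
              index=['A', 'B', 'C'], columns=['r1', 'r2'])

adj = S.dot(S.T)
print(adj)  # A-B and B-C share a reaction (non-zero entries); A-C does not (zero)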
Example #4
def _measure_cos_sim(columns_set: pd.DataFrame, rows_set: pd.DataFrame):
    similarity_matrix = rows_set.dot(columns_set.transpose())
    return similarity_matrix
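Note that the dot product above only equals cosine similarity if every row of both frames is already L2-normalized. A minimal sketch of that assumption; the helper _l2_normalize is mine, not part of the source:

import numpy as np
import pandas as pd

def _l2_normalize(df: pd.DataFrame) -> pd.DataFrame:
    # divide each row by its Euclidean norm (rows of all zeros are left unchanged)
    norms = np.sqrt((df ** 2).sum(axis=1))
    return df.div(norms.replace(0, 1), axis=0)

rows = _l2_normalize(pd.DataFrame(np.random.randn(4, 8)))
cols = _l2_normalize(pd.DataFrame(np.random.randn(5, 8)))

sims = _measure_cos_sim(columns_set=cols, rows_set=rows)
print(sims.shape)  # (4, 5); each entry is a cosine similarity in [-1, 1]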
Example #5
class GradeBook(object):
    """A class encapsulating a pandas DataFrame, meant to store the grades
    for a whole class. It provides the method compute_total_grades, which
    computes the total grade for each student according to weights provided
    by the caller.
    """

    def __init__(self, grade_arr, student_ids, item_list, max_scores):
        """
        Constructor of the class grade frame: 
	It should set the following attributes:

	(1) self.raw_grades, which is a DataFrame with 
	        - row labels given by student_ids
	        - column labels given by item_list
	        - values given by grade_arr

        (2) self.total_grades, set to None

	(3) self.letter_grades, set to None

	(4) self.max_scores, set to max_scores
        
        Parameters
        ----------
        grade_arr : numpy array of grades as returned by simulate_grades

        student_ids: a list of student ids 

	item_list: a list of grade items (e.g. ['HW', 'M', 'F'])

	max_scores: a list of the maximum possible score for each grade item
        
        Returns
        -------
        nothing 
        
        Examples
        --------
        >>> a = GradeBook(array([[1,2],[3,4]]),['22','34'],['F','M'],[30, 50])
        >>> a.letter_grades == None
        True
        >>> a.total_grades == None
        True
        >>> a.raw_grades.shape == (2,2)
        True
        >>> a.raw_grades.ix[0,0] == 1
        True
	    >>> a.max_scores[0] == 30
	    True
        """
        self.raw_grades = DataFrame(grade_arr, index=student_ids, columns=item_list)
        self.total_grades = None 
        self.letter_grades = None
        self.max_scores = max_scores
 

    def compute_total_grades(self, item_weights=None, max_score=100):
        """
        Compute student total class grades as a weighted average of the column in self.raw_grades 
        according to the weights passed to item_weight for each of the columns.
        The student total class grades are then stored in the Series attribute self.total_grades
        The return value should be a Series containing a numerical summary
        (as returned by the Series method describe) of the total class grade distribution. 
    
        Parameters
        ----------
        item_weights: list of floats summing up to one
            List of weights to be applied to each grade item (e.g. [0.3, 0.4, 0.3]) 
        
        max_score: float 
            Maximal possible score for the total class grade	
    
        Returns
        -------
        out : Series 
            A Series containing a numerical summary of the total 
    	grade distribution previously stored by the function 
    	in the attribute self.total_grades; this Series is the
    	output of the Series method describe.
        ----
    
        Examples
        --------
        >>> a = GradeBook(array([[5,5],[1,1]]),['22','34'],['F','M'],[10, 10])
	    >>> b = a.compute_total_grades([0.5, 0.5], 100)
	    >>> len(b) == 5
	    False
	    >>> a.total_grades['22'] == 50
	    True
	    >>> a.total_grades['34'] == 10
	    True
        """
        self.total_grades = self.raw_grades.dot(pd.Series(item_weights, index=self.raw_grades.columns))
        percent = (1.0 / pd.Series(self.max_scores, index=self.total_grades.index)) * max_score
        self.total_grades *= percent
        return self.total_grades.describe()
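A short usage sketch reproducing the doctests above (it assumes GradeBook is importable and that numpy and pandas are available):

import numpy as np

gb = GradeBook(np.array([[5, 5], [1, 1]]), ['22', '34'], ['F', 'M'], [10, 10])
summary = gb.compute_total_grades([0.5, 0.5], 100)

print(gb.total_grades['22'])  # 50.0
print(gb.total_grades['34'])  # 10.0
print(summary)                # output of Series.describe(): count, mean, std, ...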
Example #6
    def test_dot(self):
        a = DataFrame(np.random.randn(3, 4),
                      index=["a", "b", "c"],
                      columns=["p", "q", "r", "s"])
        b = DataFrame(np.random.randn(4, 2),
                      index=["p", "q", "r", "s"],
                      columns=["one", "two"])

        result = a.dot(b)
        expected = DataFrame(np.dot(a.values, b.values),
                             index=["a", "b", "c"],
                             columns=["one", "two"])
        # Check alignment
        b1 = b.reindex(index=reversed(b.index))
        result = a.dot(b1)
        tm.assert_frame_equal(result, expected)

        # Check series argument
        result = a.dot(b["one"])
        tm.assert_series_equal(result, expected["one"], check_names=False)
        assert result.name is None

        result = a.dot(b1["one"])
        tm.assert_series_equal(result, expected["one"], check_names=False)
        assert result.name is None

        # can pass correct-length arrays
        row = a.iloc[0].values

        result = a.dot(row)
        expected = a.dot(a.iloc[0])
        tm.assert_series_equal(result, expected)

        with pytest.raises(ValueError, match="Dot product shape mismatch"):
            a.dot(row[:-1])

        a = np.random.rand(1, 5)
        b = np.random.rand(5, 1)
        A = DataFrame(a)

        # TODO(wesm): unused
        B = DataFrame(b)  # noqa

        # it works
        result = A.dot(b)

        # unaligned
        df = DataFrame(np.random.randn(3, 4),
                       index=[1, 2, 3],
                       columns=range(4))
        df2 = DataFrame(np.random.randn(5, 3),
                        index=range(5),
                        columns=[1, 2, 3])

        with pytest.raises(ValueError, match="aligned"):
            df.dot(df2)
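A standalone sketch of the alignment behavior the test exercises: DataFrame.dot matches the left frame's columns to the right frame's index by label, so reordering the right frame's rows does not change the product.

import numpy as np
from pandas import DataFrame

a = DataFrame(np.random.randn(3, 4), index=list("abc"), columns=list("pqrs"))
b = DataFrame(np.random.randn(4, 2), index=list("pqrs"), columns=["one", "two"])
b_shuffled = b.reindex(index=list("srqp"))

# the labels, not the positions, determine the multiplication
assert np.allclose(a.dot(b).values, a.dot(b_shuffled).values)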
Example #7
def plot_eff(exp_rets: pd.DataFrame,
             cov: pd.DataFrame,
             n_points: int = 25,
             risk_free_rate: float = .1,
             show_cml: bool = False,
             show_ew: bool = False,
             show_gmv: bool = False,
             style: str = '.-',
             size: tuple = (12, 6),
             is_return: bool = False,
             save: bool = False):
    """Imprime a fronteira eficiente, baseada nos retornos
    esperados e a matriz de covariância.

    Args:
        exp_rets (pd.DataFrame): retornos esperados dos ativos.
        cov (pd.DataFrame): matriz de covariância dos ativos.
        n_points (int, optional): número de pontos a serem exibidos
        na fronteira. Padrão: 25.
        risk_free_rate (float, optional): taxa livre de risco.
        Padrão: 0.1.
        show_cml (bool, optional): se True, imprime a reta que conecta
        o ativo livre de risco com portfólio de máximo índice de Sharpe.
        Padrão: False.
        show_ew (bool, optional): se True, imprime o portfólio de pesos
        iguais. Padrão: False.
        show_gmv (bool, optional): se True, imprime o GVM portfólio.
        Padrão: False.
        style (str, optional): estilo da linha. Padrão: '.-'.
        size (tuple, optional): tamanho do plot. Padrão: (12, 6).
        is_return (bool, optional): se True, retorna o plot, ao invés de
        apenas imprimí-lo. Padrão: False.
        save (bool, optional): se True, salva o plot em save_path com
        nome de gen_portfolios.png. Padrão: False.

    Returns:
        se is_return = True, retorna um ax do matplotlib.
    """
    weights = optimal_weights(exp_rets, cov, n_points)

    rets = [(1 + exp_rets.dot(w))**.5 - 1 for w in weights]
    vols = [vol(w, cov, False) for w in weights]

    ef = pd.DataFrame({'Retornos': rets, 'Volatilidade': vols})
    ax = ef.plot.line(x='Volatilidade',
                      y='Retornos',
                      style=style,
                      figsize=size,
                      legend=False)

    plt.ylabel('Retorno')

    if show_ew:
        n = exp_rets.shape[0]
        w_ew = np.repeat(1 / n, n)
        r_ew = (1 + exp_rets.dot(w_ew))**.5 - 1
        v_ew = vol(w_ew, cov, False)

        ax.plot([v_ew], [r_ew],
                color='goldenrod',
                marker='o',
                markersize=10,
                label='EW')

    if show_gmv:
        w_gmv = gmv(cov)
        r_gmv = (1 + exp_rets.dot(w_gmv))**.5 - 1
        v_gmv = vol(w_gmv, cov, False)

        ax.plot([v_gmv], [r_gmv],
                color='midnightblue',
                marker='o',
                markersize=10,
                label='GMV')

    if show_cml:
        ax.set_xlim(left=0)

        w_msr = maximize_sr(exp_rets, cov, risk_free_rate)
        r_msr = (1 + exp_rets.dot(w_msr))**.5 - 1
        v_msr = vol(w_msr, cov, False)

        # add capital market line
        cml_x = [0, v_msr]
        cml_y = [risk_free_rate, r_msr]

        ax.plot(cml_x,
                cml_y,
                color='green',
                marker='o',
                linestyle='dashed',
                markersize=10,
                linewidth=2,
                label='Cap. Market Line')

    plt.legend()

    if save:
        plt.savefig(save_path + 'gen_portfolios.png', dpi=200)

    return ax if is_return else plt.show()
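A hedged usage sketch with made-up data; it assumes the module-level helpers referenced above (optimal_weights, vol, gmv, maximize_sr, save_path) are importable from the same project:

import numpy as np
import pandas as pd

# hypothetical daily returns for four assets
rets = pd.DataFrame(np.random.randn(250, 4) / 100, columns=list('ABCD'))
exp_rets = rets.mean()
cov = rets.cov()

ax = plot_eff(exp_rets, cov, n_points=30, risk_free_rate=0.03,
              show_cml=True, show_ew=True, show_gmv=True, is_return=True)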
Example #8
def _measure_cos_sim(train_set: pd.DataFrame, test_set: pd.DataFrame):
    similarity_matrix = test_set.dot(train_set.transpose())
    return similarity_matrix
Example #9
class VSVM(object):
    data = []  # Raw sentences
    __labels__ = []  # Labels for sentences
    vect_data = []  # Vectorized sentences

    x_test = []  # FOR
    y_test = []  # TESTING
    x_test_text = []  # Raw test sentences

    # Loads the pre-trained vectorizer model, TF-IDF matrix, and dictionary
    def __init__(self):
        with open('TFIDFVectModel.pkl', 'rb') as input_file:
            self.__vect = load(input_file)  # Vectorizer model

        with open('TFIDFMatrix.pkl', 'rb') as input_file:
            self.__tf_idf = load(input_file)  # TF-IDF matrix

        with open('Dictionary.pkl', 'rb') as input_file:
            self.__dict = load(input_file)  # Dictionary of all words in matrix

    def __repr__(self):
        return (f'VSVM,\n' f'{self.vect()},\n' f'{self.model()}.')

    # Returns the fitted vectorizer model
    def vect(self):
        return (self.__vect)

    # Returns the filled TF-IDF matrix
    def matr(self):
        return (self.__tf_idf)

    # Loads a fitted SVM model from disk
    def upload_model(self, path='GermanSVMModel2-3.pkl'):
        try:
            with open(path, 'rb') as input_file:
                self.__model = load(input_file)
        except Exception as ex:
            raise ex

    # Returns the fitted SVM model (run upload_model or fit before using it)
    def model(self):
        try:
            return (self.__model)
        except Exception as ex:
            raise ex

    # Builds the TF-IDF matrix on the given labeled data; also creates the vectorizer model and dictionary.
    # Takes two lists of strings (one with sentences, another with their labels) and a
    # list of stop words suitable for the language of the sentences.
    def transform(self, sentences_list, labels, stop_words):

        # Converts all labels to string type
        try:
            labels = [str(x) for x in labels]
        except TypeError as te:
            te.args = [
                'Type error exception occurred.\nPlease check your \'labels\' variable. It should be a list, not a number'
            ]
            raise te

        # Refreshes class field with new labels
        self.__labels__ = labels

        # Creates list with unique labels
        tags = [el for el, _ in groupby(sorted([tag for tag in labels]))]

        # Creates a dictionary with an empty list for each label
        sentences_by_tag = dict()
        for tag in tags:
            sentences_by_tag[tag] = []
        try:
            if len(sentences_list) != len(labels):
                warnings.warn(
                    'For correct results, the variables \'sentences_list\' and \'labels\' '
                    'should be the same length')
        except TypeError as te:
            te.args = [
                'Type error exception occurred.\nPlease check your \'sentences_list\' variable. It should be a list, not a number'
            ]
            raise te

        # Fills the dictionary above with the sentences belonging to each label
        for i in zip(sentences_list, labels):
            sentences_by_tag[i[1]].append(i[0])

        # Variable for all sentences, sorted by their label
        documents = [' '.join(x) for x in sentences_by_tag.values()]

        # Removes punctuation and digits, keeping only letters and whitespace
        for i in range(len(documents)):
            documents[i] = re.sub(r'[^\w\s]+|[\d]+',
                                  "",
                                  documents[i],
                                  flags=re.UNICODE)

        # Variable for Tf-Idf vectorizer
        self.__vect = TfidfVectorizer(lowercase=True,
                                      ngram_range=(2, 3),
                                      stop_words=stop_words)

        # Creates and fills Tf-Idf matrix with all sentences (documents)
        try:
            matrix = self.__vect.fit_transform(documents)

        except ValueError as ve:
            ve.args = [
                'Value error exception occurred.\nPlease check your variables \'sentences_list\' and \'labels\'. '
                'They should not be empty and they should be lists.\n'
                '\'sentences_list\' should contain something that is not only numerals'
            ]
            raise ve

        except TypeError as te:
            te.args = [
                'Type error exception occurred.\nPlease make sure that your stop-words variable is a list'
            ]
            raise te

        except MemoryError as me:
            me.args = [
                'Memory error exception occurred.\nPlease check your pagefile size and increase it'
            ]
            raise me

        # Dictionary mapping each feature (n-gram) to its column position in the TF-IDF matrix
        # (note: get_feature_names was renamed to get_feature_names_out in scikit-learn >= 1.0)
        positions = {}
        for j, word in enumerate(self.__vect.get_feature_names()):
            positions[word] = j

        # Refreshes class field with new data
        self.__dict = positions
        self.__tf_idf = DataFrame(matrix.toarray(),
                                  columns=positions.keys(),
                                  index=sentences_by_tag.keys())

    #Fits SVC model from sklearn using existing TF-IDF matrix
    #Takes two lists of strings (one (x) with some sentences, another (y) with their labels)
    def fit(self, x, y):

        # Refreshes class field with new data
        self.data = x

        # Runs the function that transforms sentences into vectors whose length equals the number of labels
        self.__create_vectors()

        # Variable for svm model
        svm = SVC(C=10, kernel='linear', probability=True)

        # Fits model
        try:
            svm.fit(self.vect_data, y)
        except TypeError as te:
            te.args = [
                'Type error exception occurred.\n'
                'Probably your answers (the y variable) contain objects of different types.\n'
                'Please check it and, if the suspicion is confirmed, convert the objects to the same type.\n'
                'Also, the error can occur if the \'y\' variable is empty'
            ]
            raise te

        # Refreshes class field with new data
        self.__model = svm

    # Builds the TF-IDF matrix on the given labeled data, then fits an SVC model using it.
    # Takes two lists of strings (one with sentences, another with their labels) and a
    # list of stop words suitable for the language of the sentences.
    def fit_transform(self, sentences_list, labels, stop_words):

        # Refreshes the class field with new data and runs the transformation function
        try:
            self.__labels__ = labels

            # Runs function that fits TF-IDF vectorizer and creates TF-IDF matrix from input data
            self.transform(sentences_list, self.__labels__, stop_words)
        except Exception as ex:
            raise ex

        # Splits data into test and train parts
        try:
            x_train, x_test, y_train, y_test = train_test_split(
                sentences_list,
                self.__labels__,
                test_size=0.125,
                random_state=228)
        except ValueError as ve:
            ve.args = [
                f'\'sentences_list\' and \'labels\' should be the same size\n'
                f'Got {len(sentences_list)} and {len(self.__labels__)}'
            ]
            raise ve

        self.y_test = y_test
        self.data = x_test
        self.x_test_text = self.data.copy()
        self.x_test_text = self.x_test_text.reset_index(drop=True)

        # Runs the function that transforms sentences into vectors whose length equals the number of labels
        try:
            self.__create_vectors()
        except Exception as ex:
            return (logging.error(ex, exc_info=True))

        self.x_test = self.vect_data

        # Runs function that fits svm model with vectorized data and labels for them
        try:
            self.fit(x_train, y_train)
        except Exception as ex:
            raise ex

    # Labels the given data (a list of sentences) based on the TF-IDF matrix.
    # Takes one list of strings (sentences).
    def predict(self, sentences_list):
        self.data = sentences_list
        self.__create_vectors()
        return (DataFrame(self.data, columns=["sentence"]).join(
            DataFrame(self.__model.predict(self.vect_data),
                      columns=["prediction"])))

    # Returns the list of probability vectors over the labels
    # (each vector is a set of numbers from 0 to 1).
    # Takes one list of strings (sentences).
    def predict_proba(self, sentences_list):
        self.data = sentences_list
        self.__create_vectors()
        return self.__model.predict_proba(self.vect_data)

    # Labels the given data (a list of sentences) based on the probability list
    # (use the same list of sentences as in the probability prediction).
    # Takes a list of strings (sentences), a list of probability vectors, and optionally a probability threshold and the name of the trash label (tag).
    def interpretate_proba(self,
                           sentences_list,
                           proba,
                           threshold=0.0969,
                           trash_tag='000'):

        # Creates a list of interpreted labels
        interp = []

        # Fills the list with the label of maximum probability if it passes the threshold, otherwise with the trash tag
        try:
            i = 0
            for i in range(len(proba)):
                maximum = max(proba[i])
                if (maximum > threshold):
                    interp.append(self.__model.classes_[list(
                        proba[i]).index(maximum)])
                else:
                    interp.append(trash_tag)
        except LookupError as le:
            le.args = [
                f'Vectors in \'proba\' and model classes should be the same size\n'
                f'Got {len(proba[i])} and {len(self.__model.classes_)}'
            ]
            raise le

        interpretated = DataFrame(sentences_list)
        interpretated.columns = ['sentence']
        return interpretated.join(DataFrame(interp, columns=['label']))

    # Prints information about the fitted model, such as accuracy, precision, recall, and per-class statistics
    def class_rep(self):
        print(
            classification_report(self.y_test,
                                  self.model().predict(self.x_test),
                                  target_names=self.__labels__))

    # Draws charts that visualize the correlation between human-labeled and machine-labeled texts.
    # Takes a pandas DataFrame containing corpora in the standard Manifesto view (with columns:
    # document name (containing date, country, party) and code).
    def visualize_pearson(self, data):

        # Splits data into test and train parts (train part dropped)
        _, x_test, _, y_test = train_test_split(data['doc_name'],
                                                data['code'],
                                                test_size=0.125,
                                                random_state=228)
        y_test.reset_index(drop=True, inplace=True)

        # Prepares data as annotated by human or computer
        hum_annot = DataFrame(y_test)
        hum_annot['doc_name'] = x_test.reset_index(drop=True)
        hum_annot.columns = ['code', 'doc_name']

        comp_annot = DataFrame(self.model().predict(self.x_test))
        comp_annot['doc_name'] = x_test.reset_index(drop=True)
        comp_annot.columns = ['code', 'doc_name']

        # Draws Pearson correlation
        self.__pears(data[['doc_name', 'code']], comp_annot, hum_annot)

    # Downloads some manifesto texts in a specific format: csv file, header: manifesto_name, content, label.
    # You can find them in C:\Users\User\ManifestoDetails\
    # Takes a dictionary with some of these params:
    #     params = {
    #     'language': 'german',
    #     'election_date': '2017'
    # }
    def get_manifesto_texts(self, params):

        # Check current versions of cores and load them from their folder
        cores = self.__get_current_cores()

        # Check existence of meta data file
        try:
            open(r'ManifestoDetails\meta.csv')

        except Exception as ex:
            ex.args = [
                'No meta file found! Please use function get_manifesto_meta or check the ManifestoDetails folder'
            ]
            raise ex

        # If cores and meta were found, downloads texts from the Manifesto database to the ManifestoDetails folder
        if cores != []:

            ManifestoAPI.get_texts(params, r'ManifestoDetails\meta.csv',
                                   r'ManifestoDetails\annot.csv',
                                   r'ManifestoDetails\not_annot.csv')

            print(
                'Texts were successfully downloaded. You can find them in your user folder, ManifestoDetails, annot.csv and not_annot.csv'
            )
        else:
            print(
                'No cores found! Please download some of them using the method \'get_manifesto_cores\''
            )

    # Downloads some manifesto meta data that is used to download the texts.
    # You can find it in C:\Users\User\ManifestoDetails\meta.csv
    # Takes a dictionary with some of these params:
    #     params = {
    #      "countryname": "Germany"
    # }
    def get_manifesto_meta(self, params):

        # Check current versions of cores and load them from their folder
        cores = self.__get_current_cores()

        # If cores were found, downloads meta data from the Manifesto database to the ManifestoDetails folder
        if cores != []:

            ManifestoAPI.get_manifesto_metadata(params, cores,
                                                r'ManifestoDetails\meta.csv')

            print(
                'Meta data was successfully downloaded. '
                'You can find it in your user folder, ManifestoDetails, meta.csv'
            )
        else:
            print(
                'No cores found! Please download some of them using the method \'get_manifesto_cores\''
            )

    # Downloads some manifesto cores of specific versions.
    # Example versions: ['MPDS2018b', 'MPDSSA2018b']
    # You can get the full list of current versions by using "get_core_versions"
    def get_manifesto_cores(self, versions):
        try:
            for i in versions:
                ManifestoAPI.api_get_core(i, kind='xlsx')
        except Exception as ex:
            ex.args = [
                'Please make sure that you gave correct versions of the manifesto cores. To check them, use the \'get_core_versions\' method'
            ]
            raise ex

    #Prints all current manifesto core versions
    def get_core_versions(self):
        print(ManifestoAPI.api_list_core_versions())

    # Checks the folder of cores and returns all that were found
    def __get_current_cores(self):

        # Directory for Manifesto cores
        pdir = r'ManifestoDetails\cores'

        # List for all files in folder
        contdir = []

        # "Walking" through content of directory and saving all file names
        for i in walk(pdir):
            contdir.append(i)

        # Grabbing the useful information (core versions) from the walk result
        cores = []
        for i in contdir[0][2]:
            cores.append(re.split(r'\.', i)[0])
        return (cores)

    # Function that draws a Pearson correlation chart for computer- and human-annotated texts.
    # Takes a pandas DataFrame containing corpora in the standard Manifesto view (with columns such as
    # document name and code) and
    # two other pandas DataFrames containing pairs of document and label (code); one holds the algorithm's
    # predictions and the other the human labels.
    def __pears(self, data, comp_annot, hum_annot):

        # Creates and fills lists with label frequency in document
        comp = []
        hum = []
        for i in data['doc_name'].unique():
            for j in data['code'].unique():
                comp.append(comp_annot[(comp_annot.doc_name == i)
                                       & (comp_annot.code == j)].shape[0])
                hum.append(hum_annot[(hum_annot.doc_name == i)
                                     & (hum_annot.code == j)].shape[0])

        # Draws Pearson correlation
        pear = sns.jointplot(x=array(comp), y=array(hum), kind='reg')
        pear = pear.set_axis_labels("computer-annotated", "human-annotated")
        pear = pear.annotate(stats.pearsonr)
        print(pear)

    # Changes the vector's length to 1.
    # Takes a list with n floats, where n is the number of labels in your data.
    def __normalize(self, vector):
        normed = linalg.norm(vector)
        if normed == 0:
            return vector
        return vector / normed

    # Converts a sentence to an array of floats using the TF-IDF matrix.
    # Takes one sentence, which must be a string.
    # You also need to run the transform method before using this one.
    def __to_vector(self, sentence):

        # n-gram count vector used to measure how strongly the sentence matches each label
        vector = zeros(len(self.__dict))

        # Check type of input data
        if isinstance(sentence, str):

            # Splits sentence into words
            splitted_sentence = simple_preprocess(str(sentence), deacc=True)

            # Removes empty spaces
            splitted_sentence = list(filter(None, splitted_sentence))

            # If the sentence has two or more words, tries to find each two-word phrase
            # in the TF-IDF vocabulary and increments the appropriate position
            if len(splitted_sentence) > 1:
                for c in range(len(splitted_sentence) - 1):
                    pair = splitted_sentence[c] + ' ' + splitted_sentence[c + 1]
                    try:
                        position = self.__dict[pair]
                        vector[position] += 1
                    except KeyError:
                        continue
            # If the sentence has three or more words, tries to find each three-word phrase
            # in the TF-IDF vocabulary and increments the appropriate position
            if len(splitted_sentence) > 2:
                for c in range(len(splitted_sentence) - 2):
                    tripl = (splitted_sentence[c] + ' ' +
                             splitted_sentence[c + 1] + ' ' +
                             splitted_sentence[c + 2])
                    try:
                        position = self.__dict[tripl]
                        vector[position] += 1
                    except KeyError:
                        continue

        return (self.__normalize(array(self.__tf_idf.dot(vector))))

    # Runs __to_vector for a list of sentences (which is expected to be in self.data).
    # Takes nothing, but you need to put some data into an instance of this class, or run
    # transform or fit_transform with correct args, before using this method.
    def __create_vectors(self):

        # Variable for vectorized sentences
        vectors = []

        # Variable for active progress bar
        j = 1

        # Variable for amount of sentences
        x = len(self.data)

        # Construction for active progress bar
        print("Vectorization in progress:")

        # Vectorizing each sentence
        for i in self.data:

            vectors.append(self.__to_vector(i))
            line = str(j) + '/' + str(x)
            print(line, end="\r")
            j += 1
        # Refreshes class field with new data
        self.vect_data = vectors
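A hedged end-to-end sketch for the class above, with made-up German sentences and Manifesto-style codes. It assumes the pickled artifacts expected by __init__ (TFIDFVectModel.pkl, TFIDFMatrix.pkl, Dictionary.pkl) exist in the working directory, and it passes the sentences as a pandas Series, since fit_transform calls reset_index on the split data:

import pandas as pd

sentences = pd.Series(['wir brauchen mehr erneuerbare energie',
                       'die steuern muessen gesenkt werden',
                       'mehr geld fuer schulen und bildung',
                       'wir senken die steuern fuer familien',
                       'erneuerbare energie schafft arbeitsplaetze',
                       'bildung ist der schluessel zur zukunft',
                       'wir investieren in wind und sonne',
                       'weniger steuern mehr netto vom brutto'])
labels = ['501', '402', '506', '402', '501', '506', '501', '402']
stop_words = ['und', 'der', 'die', 'das', 'wir', 'fuer']

clf = VSVM()                                      # loads the pickled vectorizer, matrix and dictionary
clf.fit_transform(sentences, labels, stop_words)  # rebuilds the TF-IDF matrix and fits the SVM
preds = clf.predict(['mehr geld fuer erneuerbare energie'])
print(preds)                                      # DataFrame with 'sentence' and 'prediction' columns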
Example #10
def epochs_sim_agg_returns_pair_data(dataframe: pd.DataFrame,
                                     normalized: bool = False) -> List[float]:
    """Uses local normalization to compute the aggregated distribution of
       returns for a pair of simulated stocks.

    :param dataframe: dataframe with the simulated returns.
    :type dataframe: pd.DataFrame
    :param normalized: normalize the returns within the epochs, defaults to
     False
    :type normalized: bool, optional
    :return: simulated rotated returns.
    :rtype: List[float]
    """

    if normalized:
        dataframe = (dataframe - dataframe.mean()) / dataframe.std()

    cov_two_col: pd.DataFrame = dataframe.cov()
    # eig_vec:  eigenvector, eig_val: eigenvalues
    eig_val_corr: np.ndarray
    eig_vec_corr: np.ndarray
    eig_val_corr, eig_vec_corr = np.linalg.eigh(cov_two_col)

    # rot: rotation, scale: scaling
    rot: np.ndarray
    scale: np.ndarray
    rot, scale = eig_vec_corr, np.diag(1 / np.sqrt(eig_val_corr))
    # trans: transformation matrix
    # trans = rot . scale
    trans: np.ndarray = rot.dot(scale)

    try:
        # Transform the returns
        trans_col: pd.DataFrame = dataframe.dot(trans)
        # Length DataFrame
        col_length: int = len(trans_col.columns)
        # Name the columns with the used stocks
        trans_col.columns = [f"Stock_{i}" for i in range(col_length)]

        one_col: List[Any] = []

        for idx in range(col_length):
            one_col.append(trans_col[f"Stock_{idx}"])

        agg_ret_mkt_series: pd.Series = pd.concat(one_col, ignore_index=True)

        del one_col
        del trans_col

    except np.linalg.LinAlgError as error:
        print(error)
        print()
        # the transformed returns were never built, so there is nothing to aggregate
        return []

    # remove NaN and Inf
    agg_ret_mkt_list: List[float] = [
        x for x in agg_ret_mkt_series
        if not math.isnan(x) and not math.isinf(x)
    ]
    # filter out values greater than 10 or smaller than -10
    agg_ret_mkt_list = [x for x in agg_ret_mkt_list if -10 <= x <= 10]

    return agg_ret_mkt_list
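A minimal usage sketch with simulated data (it assumes the function's module imports numpy as np, pandas as pd, math, and the typing names used in the signature):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = rng.normal(size=1000)
y = 0.5 * x + np.sqrt(1 - 0.5 ** 2) * rng.normal(size=1000)
sim_returns = pd.DataFrame({'Stock_0': x, 'Stock_1': y})

rotated = epochs_sim_agg_returns_pair_data(sim_returns, normalized=True)
print(len(rotated))  # up to 2000 rotated, filtered return values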
Example #11
def plot_eff(
    exp_rets: pd.DataFrame, cov: pd.DataFrame,
    n_points: int, risk_free_rate: float=.1,
    show_cml: bool=False, show_ew: bool=False,
    show_gmv: bool=False, style: str='.-',
    size: tuple=(12, 6), is_return: bool=False,
    save: bool=False
):
    """
    Plots the N-asset efficient frontier.
    """
    weights = optimal_weights(exp_rets, cov, n_points)

    rets = [(1 + exp_rets.dot(w)) ** .5 - 1 for w in weights]
    vols = [vol(w, cov, True) for w in weights]

    ef = pd.DataFrame({'Retornos': rets, 'Volatilidade': vols})
    ax = ef.plot.line(x='Volatilidade', y='Retornos', style=style, figsize=size)

    if show_ew:
        n = exp_rets.shape[0]
        w_ew = np.repeat(1/n, n)
        r_ew = (1 + exp_rets.dot(w_ew)) ** .5 - 1
        v_ew = vol(w_ew, cov, True)

        ax.plot([v_ew], [r_ew], color='goldenrod', marker='o', markersize=10)

    if show_gmv:
        w_gmv = gmv(cov)
        r_gmv = (1 + exp_rets.dot(w_gmv)) ** .5 - 1
        v_gmv = vol(w_gmv, cov, True)

        ax.plot([v_gmv], [r_gmv], color='midnightblue', marker='o', markersize=10)

    if show_cml:
        ax.set_xlim(left=0)

        w_msr = maximize_sr(exp_rets, cov, risk_free_rate)
        r_msr = (1 + exp_rets.dot(w_msr)) ** .5 - 1
        v_msr = vol(w_msr, cov, True)

        # add capital market line
        cml_x = [0, v_msr]
        cml_y = [risk_free_rate, r_msr]

        ax.plot(
            cml_x,
            cml_y,
            color='green',
            marker='o',
            linestyle='dashed',
            markersize=10,
            linewidth=2
        )

    if save:
        plt.savefig(save_path + 'gen_portfolios_hd.png', dpi=200)

    if not is_return:
        plt.show()
    else:
        return ax