예제 #1
0
def main():
    """Run the adjacency calculation on the purchase-order detail data.

    Loads ``PurchaseOrderDetail.csv`` into a DataFrame, builds an
    ``Adjacency`` from it together with the ``'RejectedQty'`` column name,
    and invokes ``calcandhtml`` on the result.
    """
    # Load the source data from CSV.
    purchase_orders = pd.read_csv('PurchaseOrderDetail.csv')

    adjacency = Adjacency(purchase_orders, 'RejectedQty')

    # calcandhtml() accepts an optional parameter; pass 'Y' for printing.
    adjacency.calcandhtml()
예제 #2
0
    def adjmatrix_eucliddist(self, htmlfilename):
        """Emit an HTML adjacency view of pairwise Euclidean distances.

        Computes ``euclidean_distances(self.dtm)``, flattens the result into
        one ``[i, j, distance]`` row per unordered index pair with ``i < j``,
        and passes those rows to ``Adjacency.createhtml`` along with labels
        derived from ``self.filenames``.

        Args:
            htmlfilename: Forwarded unchanged to ``Adjacency.createhtml``.
        """
        # Full-dimensional distances between texts
        # (presumably one row of self.dtm per document -- confirm).
        dist_matrix = euclidean_distances(self.dtm)

        # combinations() over the index range yields each pair once, i < j.
        pair_rows = np.array([
            [i, j, dist_matrix[i, j]]
            for i, j in combinations(range(len(dist_matrix)), 2)
        ])

        # Label each entry by its file's base name with '.txt' removed.
        labels = []
        for path in self.filenames:
            labels.append(os.path.basename(path).replace('.txt', ''))

        Adjacency.createhtml(
            twodarray=pair_rows,
            colnames=labels,
            htmlfilename=htmlfilename,
            printjson=None,
            plottitle="Euclidian distance for doc vector pairs",
        )
예제 #3
0
    def adjmatrix_docauthttest(self, htmlfilename):
        """Emit an HTML adjacency view of pairwise t-test p-values.

        Fits a ``TfidfVectorizer`` (using ``self.tokenize`` and English stop
        words) over the files named in ``self.filenames``, runs ``ttest_ind``
        on the dense vectors of every unordered document pair, and passes
        the resulting ``[i, j, p]`` rows to ``Adjacency.createhtml`` along
        with labels derived from ``self.filenames``.

        Args:
            htmlfilename: Forwarded unchanged to ``Adjacency.createhtml``.
        """
        vectorizer = TfidfVectorizer(tokenizer=self.tokenize, input='filename', stop_words='english')
        tfidf = vectorizer.fit_transform(self.filenames).toarray()

        # Document indices 0..N-1; every unordered pair is compared once.
        doc_count = len(self.filenames)

        p_rows = []
        for left, right in combinations(range(doc_count), 2):
            # Index 1 of the ttest_ind result is the p-value
            # (index 0 is the t-statistic).
            p_value = ttest_ind(tfidf[left], tfidf[right])[1]
            p_rows.append([left, right, p_value])
        p_rows = np.array(p_rows)

        # Label each entry by its file's base name with '.txt' removed.
        labels = [
            os.path.basename(path).replace('.txt', '')
            for path in self.filenames
        ]

        Adjacency.createhtml(
            twodarray=p_rows,
            colnames=labels,
            htmlfilename=htmlfilename,
            printjson=None,
            plottitle="P-value for document vector pairs",
        )