def main():
    """Load the purchase-order CSV and run the adjacency calculation on it.

    Reads ``PurchaseOrderDetail.csv`` (path relative to the working
    directory), hands the resulting frame to ``Adjacency`` together with the
    name ``'RejectedQty'``, and calls ``calcandhtml`` on the result.
    """
    # Read in data from CSV
    purchase_orders = pd.read_csv('PurchaseOrderDetail.csv')
    adjacency = Adjacency(purchase_orders, 'RejectedQty')
    # calcandhtml accepts an optional parameter value 'Y' for printing
    adjacency.calcandhtml()
def adjmatrix_eucliddist(self, htmlfilename):
    """Send pairwise Euclidean distances for ``self.dtm`` to ``createhtml``.

    Computes the distance matrix for ``self.dtm`` (presumably a
    document-term matrix with one row per file -- confirm against the code
    that builds it), flattens every unordered index pair into an
    ``[i, j, distance]`` row, and passes the rows to
    ``Adjacency.createhtml`` along with names derived from
    ``self.filenames``.

    Args:
        htmlfilename: Forwarded unchanged to ``Adjacency.createhtml``.
    """
    # Find full-D distances between texts
    dist_matrix = euclidean_distances(self.dtm)

    # One [i, j, distance] row for every possible comparison (i < j)
    pair_rows = [
        [i, j, dist_matrix[i, j]]
        for i, j in combinations(range(len(dist_matrix)), 2)
    ]
    pair_array = np.array(pair_rows)

    # Names are the file basenames with any '.txt' removed
    labels = [
        os.path.basename(path).replace('.txt', '')
        for path in self.filenames
    ]

    Adjacency.createhtml(
        twodarray=pair_array,
        colnames=labels,
        htmlfilename=htmlfilename,
        printjson=None,
        plottitle="Euclidian distance for doc vector pairs",
    )
def adjmatrix_docauthttest(self, htmlfilename):
    """Send pairwise t-test p-values for the documents to ``createhtml``.

    Fits a ``TfidfVectorizer`` (using ``self.tokenize`` and English stop
    words) over the files listed in ``self.filenames``, runs ``ttest_ind``
    on every unordered pair of resulting rows, and passes
    ``[first, second, p_value]`` rows to ``Adjacency.createhtml`` along with
    names derived from ``self.filenames``.

    Args:
        htmlfilename: Forwarded unchanged to ``Adjacency.createhtml``.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=self.tokenize,
        input='filename',
        stop_words='english',
    )
    tfidf_rows = vectorizer.fit_transform(self.filenames).toarray()

    # Indices standing in for each document, for all possible combos
    doc_indices = range(len(self.filenames))

    # Element [1] of the ttest_ind result is the p-value (not the t-statistic)
    pvalue_rows = [
        [first, second, ttest_ind(tfidf_rows[first], tfidf_rows[second])[1]]
        for first, second in combinations(doc_indices, 2)
    ]
    pvalue_array = np.array(pvalue_rows)

    # Names are the file basenames with any '.txt' removed
    labels = [
        os.path.basename(path).replace('.txt', '')
        for path in self.filenames
    ]

    Adjacency.createhtml(
        twodarray=pvalue_array,
        colnames=labels,
        htmlfilename=htmlfilename,
        printjson=None,
        plottitle="P-value for document vector pairs",
    )