def reference_update(request):
    errors = []
    warnings = []
    form = None
    valid = False
    inputFile = None
    referenceTableHTML = None
    sampleFileHTML = None
    outFolderName = '../out/'
    fileType = '.xlsx'
    sheetName = 'Sheet1'

    # Submit File View Setup
    if (request.method == "POST" and request.POST.get('upload')):
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            inputFile = request.FILES['file']
            targetSheetName = request.POST.get('target-sheet')
            inputFileDF = pd.read_excel(inputFile, targetSheetName)
            dbq.insertAssigneeKeywords(inputFileDF)
            try:
                if (inputFile):
                    inputFile.close()
                valid = True
            except Exception:
                valid = False
                errors.append(
                    "The process cannot access the input file because it is "
                    "being used by another process.")
        templateHTML = 'parola_refine/index.html'
        mainHTML = render_to_string(templateHTML, {})
        return mainHTML

    # Submit File View
    elif (request.method == "POST" and request.POST.get('process')):
        targetColumnName = request.POST.get('target-column')

    # Default View
    else:
        form = UploadFileForm()

    templateHTML = 'parola_refine/reference_update.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'inputFile': inputFile,
            'sampleFileHTML': sampleFileHTML,
        })
    return mainHTML

def index(request):
    df = pd.read_excel('../out/Assignee Sectors and Industries.xlsx', 'Sheet1')
    dbq.updateAssigneesRankSectorIndustry(df)
    # dbq.insertCPCDescriptions()

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.sort()

    templateHTML = 'data_sets/index.html'
    mainHTML = render_to_string(templateHTML, {
        'dataSetNames': dataSetNames,
    })
    return mainHTML

def index():
    query = ""
    processed_text1 = ""
    response2 = ""
    resultheading = ""

    # Debug
    # time.sleep(5)

    page, per_page, offset = get_page_items()
    total = 0
    pagination = get_pagination(
        page=page,
        per_page=per_page,
        total=total,
        format_total=True,
        format_number=True,
    )

    if request.method == 'GET':
        if 'q' in request.args:
            app.logger.debug("query from user ===> %s<===", request.args['q'])

            # Sanitize & remove trailing space
            query = bleach.clean(request.args['q']).strip()
            app.logger.debug("query from user after bleach ===> %s<===", query)

            # Start: Uncomment to trigger slow response time
            # app.logger.debug("sleeping .....")
            # time.sleep(15)
            # app.logger.debug("awake .....")
            # End: Uncomment to trigger slow response time

            (total, resultheading, processed_text1,
             response2) = DBQueries.ProcessQuery(query, offset, per_page)
            pagination = get_pagination(
                page=page,
                per_page=per_page,
                total=total,
                format_total=True,
                format_number=True,
                record_name='repositories',
            )
            if (processed_text1 == "EMPTY"):
                t1 = Suggestions.compare("now") if (
                    query == "") else Suggestions.compare(query)
                processed_text1 = NORESULT + t1
        else:
            query = ""
            processed_text1 = ""
            response2 = ""

    return render_template("index-bootstrap.html",
                           page=page,
                           total=total,
                           per_page=per_page,
                           pagination=pagination,
                           title='Ask GitHub',
                           showGAcode=os.environ['showGAcode'],
                           appenv=os.environ['deployEnv'],
                           query=[{"text": query}],
                           resultheading=resultheading,
                           response2=response2,
                           processed_text=processed_text1)

def view_data_set(request, data_set):
    df = dbq.getDataSetPatents(data_set)
    dataHTML = df.head(50).to_html()
    return render(request, 'data_sets/view.html', {
        'data_set': data_set,
        'dataHTML': dataHTML
    })

def assignee_trends(request, date):
    dates = []
    previousDate = None
    areaData = None
    targetN = 9
    if (request.method == "POST"):
        targetN = int(request.POST.get('target-n'))

    df = pd.DataFrame()
    df = dbq.getDataSetPatentsBySource(df, 'uspto')
    assigneeSectorsIndustries = dbq.getAssigneeSectorIndustry()

    for d in list(set(df['Dates'].tolist())):
        dates.append(str(d))
    dates.sort(reverse=True)
    date = dates[0]
    previousDate = dates[1]

    # Area chart data preparation
    dfCopy = df.copy()
    dfCopy = pp.assignAssigneeSectorIndustry(dfCopy, assigneeSectorsIndustries,
                                             'uspto')
    dfCopy = dfCopy.dropna(subset=['Sectors'])
    areaData = dfCopy.groupby([
        'Dates', 'Current Assignees'
    ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
    grouped = dfCopy.groupby(['Current Assignees'
                              ]).size().reset_index(name='nPPA')
    topNAssignees = grouped.nlargest(targetN,
                                     'nPPA')['Current Assignees'].tolist()
    areaData = areaData[areaData['Current Assignees'].isin(topNAssignees)]
    areaData = areaData.rename(
        index=str, columns={'Current Assignees': "CurrentAssignees"})
    areaData = areaData.to_json(orient="index")

    templateHTML = 'uspto/assignee_trends.html'
    mainHTML = render_to_string(templateHTML, {
        'date': date,
        'dates': dates,
        'areaData': areaData,
        'targetN': targetN,
    })
    return mainHTML

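# A minimal sketch (toy data, not from the app) of the groupby/unstack/stack
# pattern used in assignee_trends above: it zero-fills missing
# (date, assignee) combinations so every assignee has a row for every date.
# The column names mirror the view; the data is invented for illustration.
#
# import pandas as pd
#
# toy = pd.DataFrame({
#     'Dates': ['2020-01', '2020-01', '2020-02'],
#     'Current Assignees': ['Acme', 'Beta', 'Acme'],
# })
# counts = (toy.groupby(['Dates', 'Current Assignees'])
#              .size()
#              .unstack(fill_value=0)   # missing pairs become explicit 0s
#              .stack()
#              .reset_index(name='nPPA'))
# print(counts)  # Beta gets a 0 row for 2020-02 instead of being absent
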
def tsearch(name=None):
    query = bleach.clean(request.args['q']).strip()
    # print(query)

    # Minimum 5 characters for query
    if (len(query) <= 4):
        t = []  # return nothing!
    else:
        t = DBQueries.Typeahead(query)
    return make_response(dumps(t))

def getCPCDescriptions(df):
    CPCDescriptions = []
    descriptions = dbq.getAllCPCDescription()
    for index, row in df.iterrows():
        descriptionsString = ''
        cpc = row['MAIN CPC']
        finalCPC = str(cpc)[0:4] + str(cpc)[5:str(cpc).find('/')].replace(
            '0', '') + str(cpc)[str(cpc).find('/') + 1:].replace('/', '')
        description = descriptions.loc[descriptions['cpc'] ==
                                       finalCPC]['description']
        if (len(description) > 0):
            descriptionsString = description.tolist()[0]
        CPCDescriptions.append(descriptionsString)
    return CPCDescriptions

def getCPCDescription(CPCs):
    CPCDescriptions = []
    descriptions = dbq.getAllCPCDescription()
    for cpc in CPCs:
        descriptionsString = ''
        finalCPC = (str(cpc)[0:4] +
                    str(cpc)[5:str(cpc).find('/')].replace('0', '') +
                    str(cpc)[str(cpc).find('/') + 1:].replace('/', '')).upper()
        description = descriptions.loc[descriptions['cpc'] ==
                                       finalCPC]['description']
        if (len(description) > 0):
            descriptionsString = description.tolist()[0]
        CPCDescriptions.append(descriptionsString)
    return CPCDescriptions

def getCPCListDescriptions(df):
    CPCDescriptions = []
    descriptions = dbq.getAllCPCDescription()
    for index, row in df.iterrows():
        descriptionsList = []
        descriptionsString = ''
        cpcList = str(row['CPC']).splitlines()
        for cpc in cpcList:
            finalCPC = cpc[0:4] + cpc[5:cpc.find('/')].replace(
                '0', '') + cpc[cpc.find('/') + 1:].replace('/', '')
            description = descriptions.loc[descriptions['cpc'] ==
                                           finalCPC]['description']
            if (len(description) > 0):
                descriptionsString = description.tolist()[0]
                descriptionsList.append(descriptionsString)
        descriptionsString = str(descriptionsList)
        CPCDescriptions.append(descriptionsString)
    return CPCDescriptions

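# Hedged illustration of the CPC key normalization shared by the three
# helpers above. It assumes raw codes look like "A61K 31/00" (section, class,
# and subclass, then a space, main group, "/", subgroup); the exact source
# format is an assumption here, and normalize_cpc is a hypothetical helper,
# not a function the codebase defines.
#
# def normalize_cpc(cpc):
#     cpc = str(cpc)
#     slash = cpc.find('/')
#     return (cpc[0:4]                        # e.g. "A61K"
#             + cpc[5:slash].replace('0', '') # main group, zeros dropped
#             + cpc[slash + 1:].replace('/', '')).upper()
#
# print(normalize_cpc('A61K 31/00'))  # -> "A61K3100"
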
def category_statistics(request, data_set, classification, category):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    hasCategory = False
    nPPA = None
    categoryList = None
    selectedCategory = str(category).replace('_', '/')
    selectedCPCDescription = None
    selectedClass = ''
    selectedClassificationDisplay = ''
    maxYear = None
    assigneeData = None
    yearData = None
    targetAssigneeColumnName = "CA"
    targetYearColumnName = "YEAR"
    targetCategoryColumnName = "CATEGORY"
    previousNYears = 20

    # Data set selection view
    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    classificationNames = dbq.getClassificationList()
    classificationNames.insert(0, 'index')

    # Category selection view
    if (not data_set == 'index' and not classification == 'index'
            and selectedCategory == 'index'):
        df = pd.DataFrame()
        df = dbq.getDataSetPatentColumn(data_set, df, classification)
        selectedClass = df[classification].tolist()
        nPPA = len(df.index)
        allClass = []
        for cList in selectedClass:
            if (cList and cList == cList):
                for c in cList:
                    if (c and c == c and c != 'nan' and c != 'NAN'):
                        allClass.append(c.lstrip().rstrip())
        categoryList = sorted(list(set(allClass)))
        categoryList.insert(0, 'index')
        hasDataSet = True

    # Graph preparations
    elif (not data_set == 'index' and not classification == 'index'
          and not selectedCategory == 'index'):
        if (request.method == "POST"):
            previousNYears = int(request.POST.get('target-n-years'))
        df = pd.DataFrame()
        df = dbq.getDataSetPatentYears(data_set, df)
        nPPA = len(df.index)
        df = dbq.getDataSetPatentAssignees(data_set, df)
        df['Current Assignees'] = df['Clean Assignees'].tolist()
        df = dbq.getDataSetPatentTypes(data_set, df)
        df = dbq.getDataSetPatentColumn(data_set, df, classification)
        selectedClass = df[classification].tolist()
        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]
        years = df['Years']
        CAs = df['Current Assignees']
        types = df['Types']

        allClass = []
        allYears = []
        allCAs = []
        allTypes = []
        for year, CA, cList, patentType in zip(years, CAs, selectedClass,
                                               types):
            if (cList == cList and cList != None):
                for c in cList:
                    if (c and c == c and c != 'nan'):
                        allClass.append(c.lstrip().rstrip())
                        allYears.append(year)
                        allCAs.append(CA)
                        allTypes.append(patentType)
        categoryList = sorted(list(set(allClass)))

        if (selectedCategory in categoryList):
            expandedDF = pd.DataFrame()
            expandedDF[targetCategoryColumnName] = allClass
            expandedDF['Years'] = allYears
            expandedDF[targetAssigneeColumnName] = allCAs
            expandedDF['Types'] = allTypes
            expandedDF.drop(expandedDF[
                expandedDF[targetCategoryColumnName] !=
                selectedCategory.lstrip().rstrip()].index,
                            inplace=True)
            nPPA = len(expandedDF.index)

            # Year Bar Graph Data
            yearData = pd.crosstab(expandedDF['Years'],
                                   [expandedDF['Types']]).reset_index()
            yearData = yearData.rename(index=str,
                                       columns={'Years': "Categories"})
            yearData = yearData.to_json(orient="index")
            # yearData = cim.getCrossTabInput(expandedDF, 'Years', 'Types', 10)

            # Assignee Bar Graph Data
            assigneeData = cim.getCrossTabInput(expandedDF,
                                                targetAssigneeColumnName,
                                                'Types', 10)
            hasDataSet = True
            hasCategory = True
            valid = True
        else:
            categoryList.insert(0, 'index')
            category = selectedCategory = 'index'
            hasDataSet = True
            hasCategory = False

    templateHTML = 'visualization/category_statistics.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'hasDataSet': hasDataSet,
            'hasCategory': hasCategory,
            'classification': classification,
            'selectedClassificationDisplay': selectedClassificationDisplay,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'nPPA': nPPA,
            'dataSetNames': dataSetNames,
            'classificationNames': classificationNames,
            'categoryList': categoryList,
            'data_set': data_set,
            'category': category,
            'selectedCategory': selectedCategory,
            'selectedCPCDescription': selectedCPCDescription,
            'maxYear': maxYear,
            'yearData': yearData,
            'assigneeData': assigneeData,
            'previousNYears': previousNYears,
        })
    return mainHTML

def landscape_map(request, data_set):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    columnList = None
    dataSetNames = None
    topicsTableHTML = None
    stringUniqueAllTopics = None
    n_top_documents = 3
    documentsPerTopic = []
    selectedDataSet = data_set
    uniqueStringFeatureNames = None
    tsneData = None
    # colorList = ["#2D777F", "#173B40", "#2b9ca5", "#7cc9d2", "#5BEEFF", "#51D6E5", "#44B2BF", "#e5f2f9", "#E9EDDE", "#e2e2e2"]
    # colorList = ["#5BEEFF", "#d62728", "#2b9ca5", "#ff7f0e", "#756bb1", "#66aa00", "#1f77b4", "#bcbd22"]
    colorList = [
        "#ed7d2e", "#258296", "#6cc8db", "#194660", "#316e8c", "#2D777F",
        "#1f77b4", "#8a8a8a", "#756bb1", "#bcbd22"
    ]
    outFolderName = '../templates/visualization/'
    fileType = '.xlsx'
    scatterFileName = 'landscapeMapInput'
    scatterFileType = '.tsv'
    targetTitlesColumnName = 'Titles'
    targetAbstractsColumnName = 'Abstracts'
    targetIndependentClaimsColumnName = 'Independent Claims'

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')

    # Model setup view
    if (not selectedDataSet == 'index' and not request.method == "POST"):
        hasDataSet = True

    # Graph preparation
    elif (request.method == "POST"):
        n_top_words = 12
        df = pd.DataFrame()
        df = dbq.getDataSetPatentTACs(data_set, df)
        if (len(df.index) > 1000):
            # df = df.sample(n=500, replace=False)
            df = df.sample(frac=0.1, replace=False)
        df = df.dropna(subset=[targetTitlesColumnName])
        df = df.dropna(subset=[targetAbstractsColumnName])
        df = df.dropna(subset=[targetIndependentClaimsColumnName])

        targetFeatures = request.POST.getlist('features')
        targetMethod = request.POST.get('method')
        targetNumberOfComponents = int(
            request.POST.get('target-n-components'))
        targetNGram1 = int(request.POST.get('target-n-gram-1'))
        targetNGram2 = int(request.POST.get('target-n-gram-2'))

        wordList = []
        with open('../out/words.txt', 'r') as f:
            for line in f:
                wordList.append(line.rstrip())

        featuresDF = pd.DataFrame()
        cleanTitles = cc.removePublicationNumbers(df[targetTitlesColumnName])
        # featuresDF[targetTitlesColumnName] = pp.normalizeCorpusAsStrings(cleanTitles, wordList)
        # featuresDF[targetAbstractsColumnName] = pp.normalizeCorpusAsStrings(df[targetAbstractsColumnName], wordList)
        # featuresDF[targetIndependentClaimsColumnName] = pp.normalizeCorpusAsStrings(df[targetIndependentClaimsColumnName], wordList)
        if ('titles' in targetFeatures):
            temp = cc.removePublicationNumbers(df[targetTitlesColumnName])
            temp = pp.normalizeCorpusAsStrings(temp, wordList)
            df[targetTitlesColumnName] = cc.remove2LetterWords(temp)
            featuresDF[targetTitlesColumnName] = df[targetTitlesColumnName]
        if ('abstracts' in targetFeatures):
            temp = cc.removePublicationNumbers(df[targetAbstractsColumnName])
            temp = pp.normalizeCorpusAsStrings(temp, wordList)
            df[targetAbstractsColumnName] = cc.remove2LetterWords(temp)
            featuresDF[targetAbstractsColumnName] = df[
                targetAbstractsColumnName]
        if ('independentclaims' in targetFeatures):
            temp = cc.removePublicationNumbers(
                df[targetIndependentClaimsColumnName])
            temp = pp.normalizeCorpusAsStrings(temp, wordList)
            temp = cc.remove2LetterWords(temp)
            df[targetIndependentClaimsColumnName] = cc.removeDigits(temp)
            featuresDF[targetIndependentClaimsColumnName] = df[
                targetIndependentClaimsColumnName]

        transformerList = []
        if ('titles' in targetFeatures):
            transformerList.append(
                ('titles',
                 Pipeline([
                     ('selector',
                      ise.ItemSelector(key=targetTitlesColumnName)),
                     ('vectorizer',
                      TfidfVectorizer(stop_words='english',
                                      lowercase=True,
                                      ngram_range=(targetNGram1,
                                                   targetNGram2))),
                 ])))
        if ('abstracts' in targetFeatures):
            transformerList.append(
                ('abstracts',
                 Pipeline([
                     ('selector',
                      ise.ItemSelector(key=targetAbstractsColumnName)),
                     ('vectorizer',
                      TfidfVectorizer(stop_words='english',
                                      lowercase=True,
                                      ngram_range=(targetNGram1,
                                                   targetNGram2))),
                 ])))
        if ('independentclaims' in targetFeatures):
            transformerList.append(
                ('independent claims',
                 Pipeline([
                     ('selector',
                      ise.ItemSelector(key=targetIndependentClaimsColumnName)),
                     ('vectorizer',
                      TfidfVectorizer(stop_words='english',
                                      lowercase=True,
                                      ngram_range=(targetNGram1,
                                                   targetNGram2))),
                 ])))

        # Model fitting
        pipeline = Pipeline([
            ('union', FeatureUnion(transformer_list=transformerList)),
            ('clf',
             LatentDirichletAllocation(n_components=targetNumberOfComponents,
                                       random_state=10,
                                       doc_topic_prior=.1,
                                       learning_method='online',
                                       learning_offset=50,
                                       max_iter=random.randint(1, 6))),
            # ('clf', LatentDirichletAllocation(
            #     n_components=targetNumberOfComponents,
            #     random_state=0,
            #     learning_method='online',
            #     learning_offset=50,
            #     max_iter=5)),
            # ('clf', NMF(n_components=10, random_state=1,
            #     beta_loss='kullback-leibler', solver='mu', max_iter=1000,
            #     alpha=.1, l1_ratio=.5)),
        ])
        pipeline.fit(featuresDF)
        model = pipeline.named_steps['clf']
        wordToTopics = model.components_ / model.components_.sum(
            axis=1)[:, np.newaxis]
        topicToDocuments = pipeline.transform(featuresDF)

        words = []
        for transformer in pipeline.steps[0][1].transformer_list:
            words = words + transformer[1].named_steps[
                'vectorizer'].get_feature_names()

        topicsDF = pd.DataFrame()
        allTopics = []
        stringAllTopics = []
        stringUniqueAllTopics = []
        # allImportances = []
        allDocuments = []
        documentsPerTopic = []
        for topic_idx, topic in enumerate(wordToTopics):
            featureNames = [
                words[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ]
            stringFeatureNames = str(featureNames).replace("'", '').replace(
                "[", '').replace("]", '')
            uniqueStringFeatureNames = list(
                set(stringFeatureNames.replace(",", '').split(' ')))
            stringAllTopics.append(stringFeatureNames)
            stringUniqueAllTopics.append(
                str(uniqueStringFeatureNames).replace("'", '').replace(
                    "[", '').replace("]", '').replace(",", ''))
            # featureImportances = [importance for importance in topic.argsort()[:n_top_words]]
            top_doc_indices = np.argsort(
                topicToDocuments[:, topic_idx])[::-1][0:n_top_documents]
            tempDocumentsPerTopic = []
            for doc_index in top_doc_indices:
                allTopics.append(featureNames)
                # allImportances.append(featureImportances)
                tempDocumentsPerTopic.append(cleanTitles[doc_index])
                allDocuments.append(cleanTitles[doc_index])
            documentsPerTopic.append(tempDocumentsPerTopic)

        topicsDF['Topics'] = allTopics
        topicsTable = pd.DataFrame()
        topicsTable['Topics'] = stringUniqueAllTopics
        topicsTableHTML = topicsTable.to_html(index=False)
        stringUniqueAllTopics = zip(colorList, stringUniqueAllTopics)
        documentsPerTopic = zip(colorList, documentsPerTopic)
        # topicsDF['Importances'] = allImportances
        topicsDF['Documents'] = allDocuments
        # topicsDF.to_excel(outFolderName + 'LDA2' + fileType)

        topWords = []
        for topic_idx, topic in enumerate(wordToTopics):
            featureNames = [
                words[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ]
            topWords.append(
                str(featureNames).replace('[', '').replace(']', '').replace(
                    "'", ''))

        mostProbableTopicsIndex = [0] * len(wordToTopics[0])
        mostProbableTopics = [0] * len(wordToTopics[0])
        for word_idx, topics in enumerate(np.transpose(wordToTopics)):
            mostProbableTopicsIndex[word_idx] = np.argmax(topics)
            mostProbableTopics[word_idx] = topWords[np.argmax(topics)]

        # tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
        # tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca', perplexity=30, early_exaggeration=5)
        tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
        tsne_lda = tsne_model.fit_transform(np.transpose(wordToTopics))
        # tsne_lda = tsne_model.fit_transform(topicToDocuments)
        xCoords = tsne_lda[:, 0]
        yCoords = tsne_lda[:, 1]

        tsnseDF = pd.DataFrame()
        tsnseDF['x'] = xCoords
        tsnseDF['y'] = yCoords
        tsnseDF['n'] = mostProbableTopicsIndex
        tsnseDF['label'] = mostProbableTopics
        tsnseDF.to_csv(outFolderName + scatterFileName + scatterFileType,
                       sep='\t',
                       index=False)
        tsneData = tsnseDF.to_json(orient="index")

        hasDataSet = True
        valid = True

    templateHTML = 'visualization/landscape_map.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'hasDataSet': hasDataSet,
            'errors': errors,
            'warnings': warnings,
            'columnList': columnList,
            'data_set': data_set,
            'dataSetNames': dataSetNames,
            'selectedDataSet': selectedDataSet,
            'stringUniqueAllTopics': stringUniqueAllTopics,
            'documentsPerTopic': documentsPerTopic,
            'n_top_documents': n_top_documents,
            'topicsTableHTML': topicsTableHTML,
            'tsneData': tsneData,
        })
    return mainHTML

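# A minimal, self-contained sketch of the topic-model-then-project idea used
# in landscape_map: TF-IDF features -> LatentDirichletAllocation -> t-SNE of
# the normalized word-topic matrix. The corpus and all parameters here are
# toy assumptions; note LDA is conventionally fed raw counts, and TF-IDF is
# used only to mirror the view above.
#
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.manifold import TSNE
#
# docs = [
#     'battery charging circuit for electric vehicles',
#     'lithium battery cell management',
#     'image sensor pixel array readout',
#     'cmos image sensor noise reduction',
# ]
# X = TfidfVectorizer(stop_words='english').fit_transform(docs)
# lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
#
# # Normalize rows so each topic is a probability distribution over words.
# word_to_topics = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
#
# # Project words into 2D; perplexity must stay below the number of words.
# coords = TSNE(n_components=2, random_state=0,
#               perplexity=5).fit_transform(word_to_topics.T)
# print(coords.shape)  # (n_words, 2) -- one scatter point per word
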
def mainMenu():
    # ascii_banner = pyfiglet.figlet_format("Welcome to the assignment database")
    # print(ascii_banner)
    custom_figlet = Figlet(width=250)
    print(custom_figlet.renderText("Welcome to the assignment database"))

    while (True):
        print()  # formatting
        print("You have the following options:\n\
1: Display all assignments currently in the database\n\
2: Display records from the other tables\n\
3: Filter assignments by certain parameters (date due, instructor, class, department, etc.)\n\
4: Add in a new record\n\
5: Delete a record\n\
6: Create a separate table with specified information\n\
7: Generate table to CSV file\n\
8: Undo all previous changes\n\
9: Commit all current changes (NOTE: ONCE YOU SELECT THIS OPTION ALL CURRENT CHANGES WILL BE WRITTEN TO THE DB AND THE ACTION CANNOT BE UNDONE)")
        print()  # formatting
        userOption = input("Please input the number associated with the option you would like to perform (type \"Q\" or \"q\" to exit the program): ")
        if (userOption == 'Q' or userOption == 'q'):
            print()  # formatting
            exit_banner = pyfiglet.figlet_format("Exiting")
            print(exit_banner)
            break
        if (userOption.isnumeric()):
            userOption = int(userOption)
        else:
            print()  # formatting
            invalidInputFiglet = Figlet(width=250)
            print(invalidInputFiglet.renderText(
                "Invalid input. Please type a value from 1 - 9"))
            continue

        if (userOption == 1):
            # Print out all of the assignments currently in the database
            EstablishConnection.printAllAssignments()
        elif (userOption == 2):
            repeat = True
            while (repeat):
                tableOutput = int(input("From which table would you like to print the records: \n\
1: Faculty\n\
2: Courses\n\
3: Department\n\
4: Schools\n\
Option: "))
                if (tableOutput == 1):
                    EstablishConnection.displayFacultyData()
                    repeat = False
                elif (tableOutput == 2):
                    EstablishConnection.displayCourseData()
                    repeat = False
                elif (tableOutput == 3):
                    EstablishConnection.displayDepartmentData()
                    repeat = False
                elif (tableOutput == 4):
                    EstablishConnection.displaySchoolData()
                    repeat = False
                else:
                    print("Unrecognized input. Please type a value from 1 - 4, depending on the table from which you would like to print the records")
        elif (userOption == 3):
            detailedView = input("Would you like to see a detailed view of the assignments (as in instructor of the course, course name, etc.)?\n\
Typing 'N' will only print out the Assignment and its related details rather than a detailed list including instructor details, course details, etc: ")
            if (detailedView == 'Y' or detailedView == 'y'):
                repeat = True
                while (repeat):
                    parameterChosen = int(input("Which details would you like to print/output: \n\
1: Assignments, Courses, and Department\n\
2: Assignments, Courses, Faculty\n\
3: Assignments, Courses, Department, and Specific College\n\
4: Assignments and Courses Only\n\
Option: "))
                    if (parameterChosen == 1):
                        departmentName = input("Name of Department (case sensitive): ")
                        DBQueries.printAssignmentsCoursesDepartmentInfo(departmentName)
                        repeat = False
                    elif (parameterChosen == 2):
                        facultyName = input("Name of faculty member (FirstName LastName with a space): ")
                        DBQueries.printAssignmentsCoursesFacultyInfo(facultyName)
                        repeat = False
                    elif (parameterChosen == 3):
                        collegeName = input("Name of college from which you would like to filter the assignments outstanding: ")
                        # InsertValues.filterAssignmentsOutstandingByDepartment(departmentName)
                        DBQueries.printAssignmentsCoursesDepartmentCollegeInfo(collegeName)
                        repeat = False
                    elif (parameterChosen == 4):
                        courseName = input("Name of course from which you would like to filter the assignments outstanding: ")
                        DBQueries.printAssignmentsCoursesInfo(courseName)
                        repeat = False
                    else:
                        print("Unrecognized input. Please type a value from 1 - 4, depending on the details you would like to print")
            else:
                repeat = True
                while (repeat):
                    parameterChosen = int(input("By which parameter would you like to filter the assignments outstanding: \n\
1: Faculty\n\
2: Courses\n\
3: Department\n\
4: Schools\n\
Option: "))
                    if (parameterChosen == 1):
                        facultyName = input("Name of faculty member: MUST BE in FirstName LastName format (with a space) ")
                        DBQueries.filterOutstandingAssignmentsByFaculty(facultyName)
                        repeat = False
                    elif (parameterChosen == 2):
                        courseName = input("Name of course: ")
                        DBQueries.filterOutstandingAssignmentsByCourse(courseName)
                        repeat = False
                    elif (parameterChosen == 3):
                        departmentName = input("Name of department: ")
                        # InsertValues.filterAssignmentsOutstandingByDepartment(departmentName)
                        DBQueries.filterOutstandingAssignmentsByDepartment(departmentName)
                        repeat = False
                    elif (parameterChosen == 4):
                        schoolName = input("School/College Name: ")
                        DBQueries.filterOutstandingAssignmentsBySchool(schoolName)
                        repeat = False
                    else:
                        print("Unrecognized input. Please type a value from 1 - 4, depending on the parameter by which you would like to filter")
        elif (userOption == 4):
            repeat = True
            while (repeat):
                tableToAdd = int(input("Please input the option of the table to which you wish to add: \n\
1: Assignments\n\
2: Faculty\n\
3: Courses\n\
4: Department\n\
5: Schools\n\
Option: "))
                if (tableToAdd == 1):
                    InsertDeleteValues.theAssignmentDetails()
                    repeat = False
                elif (tableToAdd == 2):
                    InsertDeleteValues.theFacultyDetails()
                    repeat = False
                elif (tableToAdd == 3):
                    InsertDeleteValues.theCourseDetails()
                    repeat = False
                elif (tableToAdd == 4):
                    InsertDeleteValues.theDepartmentDetails()
                    repeat = False
                elif (tableToAdd == 5):
                    InsertDeleteValues.universitySchoolsDetails()
                    repeat = False
                else:
                    print("Unrecognized input. Please type a value from 1 - 5, depending on the table to which you would like to add")
        elif (userOption == 5):
            repeat = True
            while (repeat):
                tableToAdd = int(input("Please input the option of the table from which you wish to delete a record: \n\
1: Assignment\n\
2: Faculty\n\
3: Courses\n\
4: Department\n\
5: Schools\n\
Option: "))
                if (tableToAdd == 1):
                    InsertDeleteValues.deleteFromAssignments()
                    repeat = False
                elif (tableToAdd == 2):
                    InsertDeleteValues.deleteFromFaculty()
                    repeat = False
                elif (tableToAdd == 3):
                    InsertDeleteValues.deleteFromCourses()
                    repeat = False
                elif (tableToAdd == 4):
                    InsertDeleteValues.deleteFromDepartment()
                    repeat = False
                elif (tableToAdd == 5):
                    InsertDeleteValues.deleteFromUniversitySchools()
                    repeat = False
                else:
                    print("Unrecognized input. Please type a value from 1 - 5, depending on the table from which you would like to delete")
        elif (userOption == 6):
            repeat = True
            while (repeat):
                specifics = input("From which tables would you like to pull information: \n\
1: Assignments and Courses\n\
2: Assignments and Faculty\n\
3: Assignments and Department\n\
4: Assignments and College\n\
Option: ")
                if (int(specifics) == 1):
                    CreateViews.CreateViewAssignmentsCourses()
                    repeat = False
                elif (int(specifics) == 2):
                    CreateViews.CreateViewAssignmentsFaculty()
                    repeat = False
                elif (int(specifics) == 3):
                    CreateViews.CreateViewAssignmentsDepartment()
                    repeat = False
                elif (int(specifics) == 4):
                    CreateViews.CreateViewAssignmentsCollege()
                    repeat = False
                else:
                    print("Invalid input. Please type a value from 1 - 4")
        elif (userOption == 7):
            repeat = True
            while (repeat):
                selection = input("Which table would you like to have a report of? Type \"exit\" to quit (Assignments, Courses, Departments, Faculty, Schools): ")
                if selection.lower() == "assignments":
                    EstablishConnection.returnAssignments()
                elif selection.lower() == "courses":
                    EstablishConnection.returnCourses()
                elif selection.lower() == "departments":
                    EstablishConnection.returnDepartment()
                elif selection.lower() == "faculty":
                    EstablishConnection.returnFaculty()
                elif selection.lower() == "schools":
                    EstablishConnection.returnSchool()
                elif selection.lower() == "exit":
                    repeat = False
                else:
                    print("Invalid input. Type 'exit' to quit.")
        elif (userOption == 8):
            # Roll back all of the actions the user has taken
            InsertDeleteValues.RollbackAction()
            print()
            print("Changes rolled back")
        elif (userOption == 9):
            # Commit all of the actions the user has taken
            InsertDeleteValues.CommitAction()
            print()
            print("Changes committed")

def assignee_grouping(request, data_set):
    form = None
    errors = []
    warnings = []
    valid = False
    inputFile = None
    referenceTableHTML = None
    assigneeGroupingHTML = None
    assigneeGroupingSummaryHTML = None
    sampleFileHTML = None
    dataSetNames = []
    outFileName = 'Assignee Grouping'
    outFolderName = '../out/'
    fileType = '.xlsx'
    sheetName = 'Sheet1'

    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')

    if (not data_set == 'index'):
        df = pd.DataFrame()
        df = dbq.getDataSetPatentAssignees(data_set, df)
        keywordsDF = dbq.getAllAssigneeKeywords()
        df = pp.updateAssigneeGrouping(df, keywordsDF)
        dbq.updateCleanCurrentAssignees(data_set, df)
        df.to_excel(outFolderName + outFileName + fileType)
        assigneeGroupingHTML = df.head(10).to_html(index=False)
        valid = True
        # for CAs in df[targetColumnName].tolist():
        #     if(CAs == CAs):
        #         tempCAList = []
        #         CAList = str(CAs).splitlines()
        #         for CA in CAList:
        #             group = pp.getAssgineeGroup(CA, referenceDF)
        #             if(CA.lower().lstrip().rstrip() != group.lower().lstrip().rstrip()):
        #                 allCounts[allGroups.index(group)] = allCounts[allGroups.index(group)] + 1
        #             tempCAList.append(group)
        #         tempCAListString = str(tempCAList).lstrip().rstrip().replace("'", '')
        #         newCAs.append(tempCAListString)
        #     else:
        #         newCAs.append(None)
        # df['Assignee Group'] = newCAs
        # referenceDF['Counts'] = allCounts
        # df.to_excel(outFolderName + outFileName + fileType)
        # assigneeGroupingHTML = df.to_html(index=False)
        # referenceDF = referenceDF.drop(columns=['Contains', 'Does Not Contain'])
        # assigneeGroupingSummaryHTML = referenceDF.to_html(index=False)

    templateHTML = 'parola_refine/assignee_grouping.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'data_set': data_set,
            'dataSetNames': dataSetNames,
            'assigneeGroupingHTML': assigneeGroupingHTML,
        })
    return mainHTML

def cluster_map(request, data_set, classification1, classification2):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    clusterData = None
    dataSetNames = []
    selectedClassificationDisplay = ''
    selectedClass = ''
    minNodeSize = 99999
    maxNodeSize = 0
    minEdgeWeight = 99999
    maxEdgeWeight = 0
    minNodeSizeWithLabel = 20
    maxNNodes = 20
    topN = 10
    previousNYears = 20
    targetCPCColumnName = 'CPCs'
    outFileName = 'clusterMapInput'
    outFolderName = '../templates/visualization/'
    fileType = '.json'

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    classificationNames = dbq.getClassificationList()
    classificationNames.insert(0, 'index')

    # Model setup view
    if (not (data_set == 'index' or classification1 == 'index'
             or classification2 == 'index'
             or classification1 == classification2)):
        # df = dbq.getDataSetPatents(data_set)
        # if(len(df.index) > 1000):
        #     df = df.sample(n=500, replace=False, random_state=17)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentColumn(data_set, df, classification1)
        selectedClass1 = df[classification1].tolist()
        df = dbq.getDataSetPatentColumn(data_set, df, classification2)
        selectedClass2 = df[classification2].tolist()
        df = dbq.getDataSetPatentYears(data_set, df)
        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]

        allCategories = []
        uniqueCategories1 = []
        uniqueCategories2 = []
        combinedCategories = []
        allClass1 = []
        allClass2 = []
        for cList1 in selectedClass1:
            if (cList1 == cList1 and cList1 != None):
                for c in cList1:
                    if (c != 'nan' and c != '' and c != 'NAN' and c != 'Nan'):
                        allClass1.append(c)
        for cList2 in selectedClass2:
            if (cList2 == cList2 and cList2 != None):
                for c in cList2:
                    if (c != 'nan' and c != '' and c != 'NAN' and c != 'Nan'):
                        allClass2.append(c)

        expandedDF1 = pd.DataFrame()
        expandedDF1[classification1] = allClass1
        expandedDF2 = pd.DataFrame()
        expandedDF2[classification2] = allClass2
        grouped = expandedDF1.groupby([classification1
                                       ]).size().reset_index(name='nPPA')
        topNClassification1 = grouped.nlargest(
            topN, 'nPPA')[classification1].tolist()
        grouped = expandedDF2.groupby([classification2
                                       ]).size().reset_index(name='nPPA')
        topNClassification2 = grouped.nlargest(
            topN, 'nPPA')[classification2].tolist()

        # for c2, c3, c4 in zip(categories2, categories3, categories4):
        for c1, c2 in zip(selectedClass1, selectedClass2):
            if (not c1):
                c1 = []
            if (not c2):
                c2 = []
            c1 = [c for c in c1 if c in topNClassification1]
            c2 = [c for c in c2 if c in topNClassification2]
            allCategories = allCategories + c1 + c2
            combinedCategories.append(c1 + c2)
            uniqueCategories1 = uniqueCategories1 + c1
            uniqueCategories2 = uniqueCategories2 + c2
        uniqueCategories1 = list(set(uniqueCategories1))
        uniqueCategories2 = list(set(uniqueCategories2))

        # expanded = pd.DataFrame()
        # expanded['Categories'] = allCategories
        # categorySizes = expanded.groupby(['Categories']).size().reset_index(name='nPPA')
        # categoryList = categorySizes['Categories'].tolist()
        # categorySizesList = categorySizes['nPPA'].tolist()
        selectedClass = combinedCategories
        # selectedClass = categoryByKeywords
        # wordList = []
        # f = open('../out/words.txt', 'r')
        # for line in f:
        #     wordList.append(line.rstrip())
        # f.close()
        # titleWords = pp.normalizeCorpus(df['Titles'].tolist(), wordList)

        allClass = []
        for c in selectedClass:
            if (c):
                allClass = allClass + list(filter(lambda a: a != '', c))
        expanded = pd.DataFrame()
        expanded['Class'] = allClass
        classSizes = expanded.groupby(['Class']).size().reset_index(name='nPPA')
        classList = classSizes['Class'].tolist()
        classSizesList = classSizes['nPPA'].tolist()
        grouped = expanded.groupby(
            ['Class']).size().reset_index(name='Number of P/PA')
        topNClass = grouped.nlargest(10, 'Number of P/PA')['Class'].tolist()

        # Cleaning of CPC
        relationships = selectedClass
        relationshipsEval = []
        if (maxNNodes > 0):
            topNNodes = v.getTopNNodes(relationships, maxNNodes)
            for rList in relationships:
                tempRList = []
                for node in list(filter(lambda a: a != '', rList)):
                    if (node in topNNodes):
                        tempRList.append(node)
                relationshipsEval.append(tempRList)
        else:
            for rList in relationships:
                relationshipsEval.append(list(filter(lambda a: a != '',
                                                     rList)))

        source = []
        target = []
        weight = []
        for r in relationshipsEval:
            pairs = combinations(r, 2)
            for p in pairs:
                if ((p[0] in uniqueCategories1 and p[1] in uniqueCategories1)
                        or (p[0] in uniqueCategories2
                            and p[1] in uniqueCategories2)):
                    continue
                else:
                    source.append(p[0])
                    target.append(p[1])
                    weight.append(1)
                # source.append(p[0])
                # target.append(p[1])
                # weight.append(1)

        newDF = pd.DataFrame()
        newDF['source'] = source
        newDF['target'] = target
        newDF['weight'] = weight
        graphDF = newDF.groupby(['source', 'target']).sum().reset_index()
        maxEdgeWeight = graphDF['weight'].max()
        minEdgeWeight = graphDF['weight'].min()
        # graphDF.to_excel(outFolderName + 'edgelist.xlsx')

        G = nx.from_pandas_edgelist(graphDF, 'source', 'target', 'weight')
        G2 = nx.convert_node_labels_to_integers(G, label_attribute='name')
        # Determine node groups using Louvain modularity
        # communities = best_partition(G2, weight='size')
        d = nx.readwrite.json_graph.node_link_data(G2, {'name': 'index'})

        nodeNames = []
        nodeCommunities = []
        nodeSizes = []
        nodeTop10 = []
        for node in d['nodes']:
            name = node['name']
            # size = G2.degree[list(G.nodes()).index(node['name'])]
            size = classSizesList[classList.index(node['name'])]
            community = 2
            if (name in uniqueCategories1):
                community = 0
            if (name in uniqueCategories2):
                community = 1
            # community = communities[list(G.nodes()).index(node['name'])]
            node['size'] = size
            node['group'] = community
            nodeNames.append(name)
            nodeSizes.append(size)
            nodeCommunities.append(community)
            if (node['size'] < minNodeSize):
                minNodeSize = node['size']
            if (node['size'] > maxNodeSize):
                maxNodeSize = node['size']
        # minNodeSizeWithLabel = 0.2 * maxNodeSize
        # for node in d['nodes']:
        #     if(node['size'] < minNodeSizeWithLabel):
        #         node['name'] = None
        for node in d['nodes']:
            if (not node['name'] in topNClass):
                node['fontSize'] = 8
                node['opacity'] = 0.5
            else:
                node['fontSize'] = node['size']
                node['opacity'] = 1

        nodesDF = pd.DataFrame()
        nodesDF['CPC'] = nodeNames
        nodesDF['Size'] = nodeSizes
        nodesDF['Community'] = nodeCommunities

        del d["directed"]
        del d["multigraph"]
        del d["graph"]
        clusterData = d
        hasDataSet = True
        valid = True

    templateHTML = 'visualization/cluster_map.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'data_set': data_set,
            'classification1': classification1,
            'classification2': classification2,
            'classificationNames': classificationNames,
            'selectedClassificationDisplay': selectedClassificationDisplay,
            'hasDataSet': hasDataSet,
            'dataSetNames': dataSetNames,
            'minNodeSize': minNodeSize,
            'maxNodeSize': maxNodeSize,
            'maxEdgeWeight': maxEdgeWeight,
            'minEdgeWeight': minEdgeWeight,
            'clusterData': clusterData,
            'previousNYears': previousNYears,
        })
    return mainHTML

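# Toy sketch of the co-occurrence graph construction in cluster_map: every
# pairwise combination inside a record becomes a unit-weight edge, duplicate
# edges are summed into weights, and the result is loaded into networkx.
# The record values are invented CPC-like labels for illustration.
#
# from itertools import combinations
# import pandas as pd
# import networkx as nx
#
# records = [['A01B', 'B60K'], ['A01B', 'B60K', 'C07D'], ['C07D']]
# rows = [(a, b, 1) for r in records for a, b in combinations(r, 2)]
# edges = (pd.DataFrame(rows, columns=['source', 'target', 'weight'])
#            .groupby(['source', 'target']).sum().reset_index())
# G = nx.from_pandas_edgelist(edges, 'source', 'target', 'weight')
# print(edges)                 # A01B-B60K appears twice, so its weight is 2
# print(G.edges(data=True))
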
def dataset_statistics(request, data_set, classification):
    nPPA = None
    CPCLegend = None
    selectedClassificationDisplay = ''
    selectedClass = ''
    minCount = 1
    maxCount = 15
    minCount2 = 1
    maxCount2 = 15
    minCount3 = 1
    maxCount3 = 15
    targetN = 12
    previousNYears = 20
    yearData = None
    areaData = None
    categoryData = None
    categoryPercentData = None
    categoryYearData = None
    assigneeData = None
    assigneeYearData = None
    categoryAssigneeData = None
    sunBurstData = None
    smallMultipleData = None

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    classificationNames = dbq.getClassificationList()
    classificationNames.insert(0, 'index')

    if (data_set != 'index'):
        if (request.method == "POST"):
            targetN = int(request.POST.get('target-n'))
            previousNYears = int(request.POST.get('target-n-years'))

        # df = dbq.getDataSetPatents(data_set)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentYears(data_set, df)
        df = dbq.getDataSetPatentAssignees(data_set, df)
        assigneeSectorsIndustries = dbq.getAssigneeSectorIndustry()
        df['Current Assignees'] = df['Clean Assignees'].tolist()
        df = dbq.getDataSetPatentTypes(data_set, df)
        df = dbq.getDataSetPatentColumn(data_set, df, classification)

        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]
        df = df[df['Current Assignees'] != '']
        years = df['Years']
        CAs = df['Current Assignees']
        types = df['Types']
        selectedClass = df[classification].tolist()
        nPPA = len(df.index)

        topNAssignees = pd.crosstab(
            df['Current Assignees'], [df['Types']],
            margins=True).sort_values(by=['All'],
                                      ascending=False).reset_index().drop(
                                          ['All'], axis=1).drop([0]).head(
                                              targetN)['Current Assignees'].tolist()

        dfCopy = df.copy()
        smallMultipleData = dfCopy.groupby([
            'Years', 'Current Assignees'
        ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
        smallMultipleData = smallMultipleData[
            smallMultipleData['Current Assignees'].isin(topNAssignees)]
        smallMultipleData = smallMultipleData.rename(
            index=str, columns={'Current Assignees': "CurrentAssignees"})
        smallMultipleData = smallMultipleData.to_json(orient="index")

        dfCopy = df.copy()
        dataSetSource = Datasets.objects.filter(name=data_set)[0].source
        dfCopy = pp.assignAssigneeSectorIndustry(dfCopy,
                                                 assigneeSectorsIndustries,
                                                 dataSetSource)
        dfCopy = dfCopy.dropna(subset=['Sectors'])
        dfCopy = dfCopy.dropna(subset=['Industries'])
        dfCopy = dfCopy[['Sectors', 'Industries', 'ids']]
        sectorsIndustriesCounts = dfCopy.groupby(
            ['Sectors', 'Industries']).size().reset_index(name='nPPA')

        d = {"name": "flare", "children": []}
        for line in sectorsIndustriesCounts.values:
            the_parent = line[0]
            the_child = line[1]
            child_size = line[2]
            keys_list = []
            for item in d['children']:
                keys_list.append(item['name'])
            if not the_parent in keys_list:
                d['children'].append({
                    "name": the_parent,
                    "children": [{
                        "name": the_child,
                        "size": child_size
                    }]
                })
            else:
                d['children'][keys_list.index(the_parent)]['children'].append(
                    {
                        "name": the_child,
                        "size": child_size
                    })
        sunBurstData = d

        allYears = []
        allCAs = []
        allClass = []
        allTypes = []
        for cList, year, CA, patentType in zip(selectedClass, years, CAs,
                                               types):
            if (cList == cList and cList != None):
                for c in cList:
                    if (c != 'nan' and c != '' and c != 'NAN'):
                        allClass.append(c)
                        allYears.append(year)
                        allCAs.append(CA)
                        allTypes.append(patentType)
        expandedDF = pd.DataFrame()
        expandedDF['Categories'] = allClass
        expandedDF['Years'] = allYears
        expandedDF['Current Assignees'] = allCAs
        expandedDF['Types'] = allTypes

        # Number of PPA per year for line graph
        yearData = cim.getGroupByInput(df, 'Years', 'nPPA')

        # Number of PPA per year for area chart
        # areaData = cim.getGroupByInput2(df, 'Years', 'Types', 'nPPA')
        counts = df.groupby([
            'Years', 'Types'
        ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
        areaData = counts.to_json(orient="index")

        # Top 10 category counts
        categoryData = cim.getCrossTabInput(expandedDF, 'Categories', 'Types',
                                            targetN)
        categoryCounts = pd.crosstab(expandedDF['Categories'],
                                     [expandedDF['Types']],
                                     margins=True).sort_values(
                                         'All', ascending=False).reset_index()
        categoryCounts = categoryCounts.drop(['All'],
                                             axis=1).drop([0]).head(targetN)
        if (classification == 'cpc'):
            CPCLegend = zip(
                categoryCounts['Categories'].tolist(),
                pp.getCPCDescription(categoryCounts['Categories'].tolist()))

        categoryPercentages = expandedDF.groupby(
            ['Categories']).size().reset_index(name='percent')
        total = categoryPercentages['percent'].sum()
        categoryPercentages[
            'percent'] = categoryPercentages['percent'] / total * 100
        categoryPercentages['percent'] = categoryPercentages['percent'].round(
            decimals=1)
        categoryPercentages = categoryPercentages.head(targetN)
        categoryPercentData = categoryPercentages.to_json(orient="index")

        # Top 10 categories and their PPA counts per year
        grouped = expandedDF.groupby(
            ['Categories']).size().reset_index(name='Number of P/PA')
        topNCategories = grouped.nlargest(
            targetN, 'Number of P/PA')['Categories'].tolist()
        expandedDF = expandedDF[expandedDF['Categories'].isin(topNCategories)]
        expandedDF = expandedDF.sort_values(['Years', 'Categories'],
                                            ascending=False)
        hm = expandedDF.groupby(['Years',
                                 'Categories']).size().reset_index(name='nPPA')
        hm = hm.sort_values(['Years'], ascending=True)
        hm = hm.sort_values(['nPPA'], ascending=False)
        hm = hm[hm.Years != 0]
        maxCount = hm['nPPA'].max()
        minCount = hm['nPPA'].min()
        categoryYearData = hm.to_json(orient="index")

        # Top assignees
        assigneeData = cim.getCrossTabInput(df, 'Current Assignees', 'Types',
                                            targetN)
        # grouped = df.groupby(['Current Assignees']).size().reset_index(name='Number of P/PA')
        # topNAssignees = grouped.nlargest(targetN, 'Number of P/PA')['Current Assignees'].tolist()

        # Top 10 assignees and their PPA counts per year
        dfCopy = df.copy()
        dfCopy = dfCopy[dfCopy['Current Assignees'].isin(topNAssignees)]
        hm = dfCopy.groupby(['Years', 'Current Assignees'
                             ]).size().reset_index(name='nPPA')
        hm = hm[hm.Years != 0]
        maxCount2 = hm['nPPA'].max()
        minCount2 = hm['nPPA'].min()
        hm = hm.rename(index=str, columns={'Current Assignees': "Categories"})
        tempDF = pd.DataFrame(columns=['Years', 'Categories', 'nPPA'])
        tempDF['Categories'] = topNAssignees
        tempDF['Years'] = 9999
        tempDF['nPPA'] = 0
        tempDF = tempDF.append(hm).reset_index()
        assigneeYearData = tempDF.to_json(orient="index")

        # Top 10 categories and top 10 assignees
        expandedDF = expandedDF[expandedDF['Current Assignees'].isin(
            topNAssignees)]
        hm = expandedDF.groupby(['Categories', 'Current Assignees'
                                 ]).size().reset_index(name='nPPA')
        # hm = hm.sort_values(['CurrentAssignees'], ascending=True)
        hm = hm.sort_values(['nPPA'], ascending=False)
        hm = hm.rename(index=str,
                       columns={'Current Assignees': "CurrentAssignees"})
        tempDF = pd.DataFrame(
            columns=['Categories', 'CurrentAssignees', 'nPPA'])
        tempDF['CurrentAssignees'] = topNAssignees
        tempDF['Categories'] = "9999"
        tempDF['nPPA'] = 0
        tempDF = tempDF.append(hm).reset_index()
        categoryAssigneeData = tempDF.to_json(orient="index")
        maxCount3 = hm['nPPA'].max()
        minCount3 = hm['nPPA'].min()

    templateHTML = 'visualization/dataset_statistics.html'
    mainHTML = render_to_string(
        templateHTML, {
            'data_set': data_set,
            'classification': classification,
            'selectedClassificationDisplay': selectedClassificationDisplay,
            'dataSetNames': dataSetNames,
            'classificationNames': classificationNames,
            'minCount': minCount,
            'maxCount': maxCount,
            'minCount2': minCount2,
            'maxCount2': maxCount2,
            'minCount3': minCount3,
            'maxCount3': maxCount3,
            'nPPA': nPPA,
            'CPCLegend': CPCLegend,
            'targetN': targetN,
            'previousNYears': previousNYears,
            'yearData': yearData,
            'areaData': areaData,
            'categoryData': categoryData,
            'categoryPercentData': categoryPercentData,
            'categoryYearData': categoryYearData,
            'assigneeData': assigneeData,
            'assigneeYearData': assigneeYearData,
            'categoryAssigneeData': categoryAssigneeData,
            'sunBurstData': sunBurstData,
            'smallMultipleData': smallMultipleData
        })
    return mainHTML

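# Small illustration (toy data) of the crosstab pattern used for the "top N"
# bar charts in dataset_statistics: counts per category broken down by type,
# sorted on the 'All' margin, then the margin row and column are dropped.
#
# import pandas as pd
#
# toy = pd.DataFrame({'Categories': ['X', 'X', 'Y', 'Z'],
#                     'Types': ['patent', 'application', 'patent', 'patent']})
# ct = (pd.crosstab(toy['Categories'], toy['Types'], margins=True)
#         .sort_values('All', ascending=False)
#         .reset_index())
# ct = ct.drop(['All'], axis=1).drop([0])  # drop margin column and 'All' row
# print(ct.head(2))                        # X first (2 records), then Y or Z
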
def listlanguages():
    reponame = bleach.clean(request.args['a']).strip()
    # TODO: Handle empty reponame
    Languages = DBQueries.LanguageBreakdown(reponame)
    return jsonify(languages=Languages)

def general_trends(request, date):
    dates = []
    previousDate = None
    yearData = None
    targetN = 10
    if (request.method == "POST"):
        targetN = int(request.POST.get('target-n'))

    df = pd.DataFrame()
    df = dbq.getDataSetPatentsBySource(df, 'uspto')
    assigneeSectorsIndustries = dbq.getAssigneeSectorIndustry()

    for d in list(set(df['Dates'].tolist())):
        dates.append(str(d))
    dates.sort(reverse=True)
    date = dates[0]
    previousDate = dates[1]

    # Number of PPA per week for line graph
    yearData = cim.getGroupByInput(df, 'Dates', 'nPPA')

    # Assignee data preparation
    dfCopy = df.copy()
    dfCopy = dfCopy[dfCopy['Dates'].isin([date, previousDate])]
    dfCopy = pp.assignAssigneeSectorIndustry(dfCopy, assigneeSectorsIndustries,
                                             'uspto')
    dfCopy = dfCopy.dropna(subset=['Sectors'])
    dfCopy['Current Assignees'] = dfCopy['Clean Assignees']
    dfCopy = dfCopy[['Dates', 'Current Assignees', 'ids']]
    assignees = list(set(dfCopy['Current Assignees'].tolist()))
    counts = dfCopy.groupby([
        'Dates', 'Current Assignees'
    ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
    countBefore = counts['nPPA'].tolist()[:len(assignees)]
    countAfter = counts['nPPA'].tolist()[len(assignees):]
    differences = counts.groupby(['Current Assignees'
                                  ]).diff()[len(assignees):]
    totals = counts.groupby(['Current Assignees']).sum()
    assigneeData = pd.DataFrame()
    assigneeData['Current Assignees'] = assignees
    assigneeData = assigneeData.sort_values(by='Current Assignees')
    assigneeData['Before'] = countBefore
    assigneeData['After'] = countAfter
    assigneeData['Total'] = totals['nPPA'].tolist()
    assigneeData['Change'] = differences['nPPA'].tolist()
    assigneeData[
        'PercentChange'] = assigneeData['Change'] / assigneeData['Total'] * 100
    assigneeTopLosersData = assigneeData.sort_values(by='Change').head(targetN)
    assigneeData = assigneeData.sort_values(by='Change',
                                            ascending=False).head(targetN)
    assigneeLosersChanges = zip(assigneeTopLosersData['Current Assignees'],
                                assigneeTopLosersData['Change'],
                                assigneeTopLosersData['PercentChange'])
    assigneeChanges = zip(assigneeData['Current Assignees'],
                          assigneeData['Change'],
                          assigneeData['PercentChange'])

    # Sector data preparation
    dfCopy = df.copy()
    dfCopy = dfCopy[dfCopy['Dates'].isin([date, previousDate])]
    dfCopy = pp.assignAssigneeSectorIndustry(dfCopy, assigneeSectorsIndustries,
                                             'uspto')
    dfCopy = dfCopy.dropna(subset=['Sectors'])
    dfCopy = dfCopy[['Dates', 'Sectors', 'ids']]
    sectors = list(set(dfCopy['Sectors'].tolist()))
    counts = dfCopy.groupby([
        'Dates', 'Sectors'
    ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
    countBefore = counts['nPPA'].tolist()[:len(sectors)]
    countAfter = counts['nPPA'].tolist()[len(sectors):]
    differences = counts.groupby(['Sectors']).diff()[len(sectors):]
    totals = counts.groupby(['Sectors']).sum()
    sectorData = pd.DataFrame()
    sectorData['Sectors'] = sectors
    sectorData = sectorData.sort_values(by='Sectors')
    sectorData['Before'] = countBefore
    sectorData['After'] = countAfter
    sectorData['Total'] = totals['nPPA'].tolist()
    sectorData['Change'] = differences['nPPA'].tolist()
    sectorData[
        'PercentChange'] = sectorData['Change'] / sectorData['Total'] * 100
    sectorData = sectorData.sort_values(by='Change', ascending=False)
    sectorChanges = zip(sectorData['Sectors'], sectorData['Change'],
                        sectorData['PercentChange'])

    templateHTML = 'uspto/general_trends.html'
    mainHTML = render_to_string(
        templateHTML, {
            'date': date,
            'dates': dates,
            'yearData': yearData,
            'sectorData': sectorData,
            'sectorChanges': sectorChanges,
            'assigneeChanges': assigneeChanges,
            'assigneeLosersChanges': assigneeLosersChanges,
            'targetN': targetN,
        })
    return mainHTML

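# Toy sketch of the before/after delta logic in general_trends: counts for
# the two most recent dates are zero-filled, then groupby(...).diff() yields
# the week-over-week change for each group. Data is invented for illustration.
#
# import pandas as pd
#
# toy = pd.DataFrame({'Dates': ['w1', 'w1', 'w2'],
#                     'Sectors': ['Tech', 'Auto', 'Tech']})
# counts = (toy.groupby(['Dates', 'Sectors']).size()
#              .unstack(fill_value=0).stack().reset_index(name='nPPA'))
# diffs = counts.groupby('Sectors')['nPPA'].diff()
# print(counts.assign(Change=diffs))  # NaN for w1 rows, the delta for w2 rows
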
def new_data_setup(request):
    errors = []
    warnings = []
    form = None
    valid = False
    hasInputFile = False
    columnList = None
    dataSetName = None
    targetYearColumnName = None
    targetAssigneeColumnName = None
    sampleFileHTML = None
    inputFileHTML = None
    inFileName = 'newDataInput'
    inFolderName = '../in/'
    fileType = '.xlsx'

    # Submit File View Setup
    if (request.method == "POST" and request.POST.get('upload')):
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            inputFile = request.FILES['file']
            targetSheetName = request.POST.get('target-sheet')
            inputFileDF = pd.read_excel(inputFile, targetSheetName)
            columnList = list(inputFileDF.columns)
            columnList.append('Not Applicable')
            inputFileDF.to_excel(inFolderName + inFileName + fileType,
                                 index=False)
            inputFileHTML = inputFileDF.head(50).to_html()
            try:
                if (inputFile):
                    inputFile.close()
            except Exception:
                valid = False
                errors.append(
                    "The process cannot access the input file because it is "
                    "being used by another process.")
        hasInputFile = True

    # Step 2 View
    elif (request.method == "POST" and request.POST.get('finish')):
        dataSetName = request.POST.get('data-set-name')
        sourceName = request.POST.get('target-source')

        # Map each POST field to the canonical column name it supplies.
        # A selection of 'Not Applicable' yields an empty column; otherwise
        # the user-selected column is renamed to the canonical name.
        columnMappings = [
            ('target-column-publication-number', 'PUBLICATION NUMBER'),
            ('target-column-assignee', 'CA'),
            ('target-column-year', 'YEAR'),
            ('target-main-cpc', 'MAIN CPC'),
            ('target-column-cpc', 'CPC'),
            ('target-column-category', 'CATEGORY'),
            ('target-column-titles', 'TITLES'),
            ('target-column-abstracts', 'ABSTRACTS'),
            ('target-column-independent-claims', 'INDEPENDENT CLAIMS'),
            ('target-column-technical-concepts', 'TECHNICAL CONCEPTS'),
        ]

        df = pd.read_excel(inFolderName + inFileName + fileType, 'Sheet1')
        for postField, columnName in columnMappings:
            selectedColumn = request.POST.get(postField)
            if (selectedColumn == 'Not Applicable'):
                df[columnName] = ''
            else:
                df = df.rename(index=str,
                               columns={selectedColumn: columnName})

        # df['MAIN CPC DESCRIPTION'] = pp.getCPCDescriptions(df)
        # df['CPC DESCRIPTIONS'] = pp.getCPCListDescriptions(df)
        # temp = cc.removePublicationNumbers(df['TECHNICAL CONCEPTS'].tolist())
        # temp = cc.removeConceptPostfix(temp)
        # temp = cc.getTechnicalConcepts(temp)
        # df['TECHNICAL CONCEPTS'] = temp
        df['TYPE'] = cc.getDocumentTypes(df['PUBLICATION NUMBER'], 9)
        keywordsDF = dbq.getAllAssigneeKeywords()
        df = pp.assigneeGrouping(df, keywordsDF)
        # dbq.updateCleanCurrentAssignees(dataSetName, df)
        dbq.insertPatents(df, dataSetName, sourceName)
        hasInputFile = True
        valid = True

        dataSetNames = []
        datasets = Datasets.objects.all()
        for dataset in datasets:
            dataSetNames.append(dataset.name)
        dataSetNames.sort()
        templateHTML = 'data_sets/index.html'
        mainHTML = render_to_string(templateHTML, {
            'dataSetNames': dataSetNames,
        })
        return mainHTML

    # Default View
    else:
        form = UploadFileForm()
        sampleFileDF = pd.read_excel('../out/Small Sample File.xlsx', 'Sheet1')
        sampleFileHTML = sampleFileDF.head().to_html()

    templateHTML = 'data_sets/new_data_setup.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'hasInputFile': hasInputFile,
            'errors': errors,
            'warnings': warnings,
            'columnList': columnList,
            'sampleFileHTML': sampleFileHTML,
            'inputFileHTML': inputFileHTML,
        })
    return mainHTML

def assignee_statistics(request, data_set, classification, assignee):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    hasAssignee = False
    nPPA = None
    CPCLegend = None
    assigneeList = None
    selectedClass = ''
    selectedClassificationDisplay = ''
    maxYear = None
    categoryData = None
    yearData = None
    previousNYears = 20
    targetAssigneeColumnName = "CA"
    targetYearColumnName = "YEAR"
    targetCategoryColumnName = "Categories"

    # Data set selection view
    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    classificationNames = dbq.getClassificationList()
    classificationNames.insert(0, 'index')

    # Category selection view
    if (not data_set == 'index' and assignee == 'index'):
        # df = dbq.getDataSetPatents(data_set)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentAssignees(data_set, df)
        df['Current Assignees'] = df['Clean Assignees'].tolist()
        CAs = df['Current Assignees']
        assigneeList = sorted(list(set(CAs.tolist())))
        assigneeList.insert(0, 'index')
        hasDataSet = True

    # Graph preparations
    elif (not data_set == 'index' and not assignee == 'index'):
        if (request.method == "POST"):
            previousNYears = int(request.POST.get('target-n-years'))
        # df = dbq.getDataSetPatents(data_set)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentYears(data_set, df)
        df = dbq.getDataSetPatentTypes(data_set, df)
        df = dbq.getDataSetPatentAssignees(data_set, df)
        assigneeList = sorted(list(set(df['Current Assignees'].tolist())))
        df['Current Assignees'] = df['Clean Assignees'].tolist()
        df = dbq.getDataSetPatentColumn(data_set, df, classification)
        df.drop(df[df['Current Assignees'] != assignee].index, inplace=True)

        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]
        years = df['Years']
        nPPA = len(df.index)
        selectedClass = df[classification].tolist()
        CAs = df['Current Assignees']
        types = df['Types']

        allClass = []
        allYears = []
        allCAs = []
        allTypes = []
        for year, CA, cList, patentType in zip(years, CAs, selectedClass,
                                               types):
            if (cList == cList and cList != None):
                # for c in ast.literal_eval(cList):
                for c in cList:
                    if (c and c == c and c != 'nan' and c != 'NAN'):
                        allClass.append(c.lstrip().rstrip())
                        allYears.append(int(year))
                        allCAs.append(str(CA).lower().lstrip().rstrip())
                        allTypes.append(patentType)
        expandedDF = pd.DataFrame()
        expandedDF[targetCategoryColumnName] = allClass
        expandedDF[targetYearColumnName] = allYears
        expandedDF[targetAssigneeColumnName] = allCAs
        expandedDF['Types'] = allTypes

        # Line Graph Data
        expandedDFCopy = expandedDF.copy()
        grouped = df.groupby(['Years'])
        groupSizes = df.groupby(['Years']).size()
        years = []
        sizes = []
        for g, s in zip(grouped, groupSizes):
            years.append(int(g[0]))
            sizes.append(s)
        sizesDF = pd.DataFrame()
        sizesDF['Year'] = years
        sizesDF['Count'] = sizes
        maxYear = int(sizesDF.iloc[sizesDF['Count'].argmax()]['Year'])

        # Bar Graph 1 Data
        uniqueCategories = []
        expandedDFCopy = expandedDF.copy()
        assigneeCounts = pd.crosstab(
            expandedDFCopy[targetCategoryColumnName],
            [expandedDFCopy['Types']],
            margins=True).sort_values('All', ascending=False).reset_index()
        assigneeCounts = assigneeCounts.drop(['All'],
                                             axis=1).drop([0]).head(10)
        uniqueCategories = assigneeCounts[targetCategoryColumnName].tolist()
        categoryData = assigneeCounts.to_json(orient="index")

        # Bar Graph 2 Data
        # Year Bar Graph Data
        yearData = pd.crosstab(df['Years'], [df['Types']]).reset_index()
        yearData = yearData.rename(index=str, columns={'Years': "Categories"})
        yearData = yearData.to_json(orient="index")

        if (classification == 'cpc'):
            CPCLegend = zip(uniqueCategories,
                            pp.getCPCDescription(uniqueCategories))
        hasDataSet = True
        hasAssignee = True
        valid = True

    templateHTML = 'visualization/assignee_statistics.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'hasDataSet': hasDataSet,
            'hasAssignee': hasAssignee,
            'classification': classification,
            'selectedClassificationDisplay': selectedClassificationDisplay,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'nPPA': nPPA,
            'CPCLegend': CPCLegend,
            'dataSetNames': dataSetNames,
            'classificationNames': classificationNames,
            'assigneeList': assigneeList,
            'data_set': data_set,
            'assignee': assignee,
            'maxYear': maxYear,
            'categoryData': categoryData,
            'yearData': yearData,
            'previousNYears': previousNYears,
        })
    return mainHTML

# -*- coding: utf-8 -*-
import DBQueries

##########################
# Collect dataframe DBs  #
##########################
dbs = DBQueries.dictDB()
dbs = dbs.sort_values(['DB', 'Date'], ascending=[True, False])
dbs = dbs.groupby('DB').first().reset_index()

with open('data/current.ini', 'w') as the_file:
    the_file.write('[DBs]\n')
    the_file.write('number={}\n'.format(len(dbs)))
    the_file.write('[first_edition]\n')
    the_file.write('host=database\n')
    the_file.write('database=enzo\n')
    the_file.write('user=postgres\n')
    the_file.write('port=5432\n')
    for index, row in dbs.iterrows():
        the_file.write('[{}]\n'.format(row['DB']))
        the_file.write('host=10.9.0.50\n')
        the_file.write('database={}\n'.format(row['Hash']))
        the_file.write('user=postgres\n')
        the_file.write('port=5432\n')
        the_file.write('password=xxxxxx\n')
    the_file.write('[ElasticSearch]\n')
    # the_file.write('server=elk.enzo.net\n')
    the_file.write('server=10.9.0.200\n')
    the_file.write('index_name=first_edition*\n')

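# Hedged companion sketch: reading the generated INI back with the standard
# library's configparser, assuming the file layout written above.
#
# import configparser
#
# config = configparser.ConfigParser()
# config.read('data/current.ini')
# print(config['DBs']['number'])           # number of collected DBs
# for section in config.sections():
#     if config.has_option(section, 'database'):
#         print(section, '->', config[section]['database'])
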
def word_cluster_map(request, data_set, column):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    clusterData = None
    dataSetNames = []
    selectedClassificationDisplay = ''
    minNodeSize = 99999
    maxNodeSize = 0
    minEdgeWeight = 99999
    maxEdgeWeight = 0
    minNodeSizeWithLabel = 20
    maxNNodes = 30
    previousNYears = 20
    topN = 10

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    columnNames = ['titles', 'abstracts', 'independent_claims']
    columnNames.insert(0, 'index')

    # Model setup view
    if (not (data_set == 'index' or column == 'index')):
        if (request.method == "POST"):
            maxNNodes = int(request.POST.get('target-n-nodes'))
            previousNYears = int(request.POST.get('target-n-years'))
        # if(len(df.index) > 1000):
        #     df = df.sample(n=500, replace=False, random_state=17)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentTACs(data_set, df)
        df = dbq.getDataSetPatentYears(data_set, df)
        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]

        wordList = []
        with open('../out/words.txt', 'r') as f:
            for line in f:
                wordList.append(line.rstrip())

        columnWords = []
        if (column == 'titles'):
            columnWords = pp.normalizeCorpus(df['Titles'].tolist(), wordList)
        elif (column == 'abstracts'):
            columnWords = pp.normalizeCorpus(df['Abstracts'].tolist(),
                                             wordList)
        elif (column == 'independent_claims'):
            columnWords = pp.normalizeCorpus(df['Independent Claims'].tolist(),
                                             wordList)
        selectedColumn = columnWords

        uniqueWords = []
        combinedWords = []
        allWords = []
        for wordList in selectedColumn:
            if (wordList == wordList and wordList != None):
                for word in wordList:
                    if (word != 'nan' and word != '' and word != 'NAN'
                            and word != 'Nan'):
                        allWords.append(word)
        expandedDF = pd.DataFrame()
        expandedDF[column] = allWords
        uniqueWords = list(set(allWords))
        wordSizes = expandedDF.groupby([column]).size().reset_index(name='nPPA')
        topNWords = wordSizes.nlargest(topN, 'nPPA')[column].tolist()
        wordList = wordSizes[column].tolist()
        wordSizesList = wordSizes['nPPA'].tolist()

        # Cleaning of CPC
        relationships = selectedColumn
        relationshipsEval = []
        if (maxNNodes > 0):
            topNNodes = v.getTopNNodes(relationships, maxNNodes)
            for rList in relationships:
                tempRList = []
                for node in list(filter(lambda a: a != '', rList)):
                    if (node in topNNodes):
                        tempRList.append(node)
                relationshipsEval.append(tempRList)
        else:
            for rList in relationships:
                relationshipsEval.append(list(filter(lambda a: a != '',
                                                     rList)))

        source = []
        target = []
        weight = []
        for r in relationshipsEval:
            pairs = combinations(r, 2)
            for p in pairs:
                source.append(p[0])
                target.append(p[1])
                weight.append(1)
        newDF = pd.DataFrame()
        newDF['source'] = source
        newDF['target'] = target
        newDF['weight'] = weight
        graphDF = newDF.groupby(['source', 'target']).sum().reset_index()
        maxEdgeWeight = graphDF['weight'].max()
        minEdgeWeight = graphDF['weight'].min()
        # graphDF.to_excel(outFolderName + 'edgelist.xlsx')

        G = nx.from_pandas_edgelist(graphDF, 'source', 'target', 'weight')
        G2 = nx.convert_node_labels_to_integers(G, label_attribute='name')
        # Determine node groups using Louvain modularity
        communities = best_partition(G2, weight='size')
        d = nx.readwrite.json_graph.node_link_data(G2, {'name': 'index'})

        nodeNames = []
        nodeCommunities = []
        nodeSizes = []
        nodeTop10 = []
        for node in d['nodes']:
            name = node['name']
            # size = G2.degree[list(G.nodes()).index(node['name'])]
            size = wordSizesList[wordList.index(node['name'])]
            community = communities[list(G.nodes()).index(node['name'])]
            node['size'] = size
            node['group'] = community
            nodeNames.append(name)
            nodeSizes.append(size)
            nodeCommunities.append(community)
            if (node['size'] < minNodeSize):
                minNodeSize = node['size']
            if (node['size'] > maxNodeSize):
                maxNodeSize = node['size']
        # minNodeSizeWithLabel = 0.2 * maxNodeSize
        # for node in d['nodes']:
        #     if(node['size'] < minNodeSizeWithLabel):
        #         node['name'] = None
        for node in d['nodes']:
            if (not node['name'] in topNWords):
                node['fontSize'] = 8
                node['opacity'] = 0.5
            else:
                node['fontSize'] = node['size']
                node['opacity'] = 1

        nodesDF = pd.DataFrame()
        nodesDF['CPC'] = nodeNames
        nodesDF['Size'] = nodeSizes
        nodesDF['Community'] = nodeCommunities

        del d["directed"]
        del d["multigraph"]
        del d["graph"]
        clusterData = d
        hasDataSet = True
        valid = True

    templateHTML = 'visualization/word_cluster_map.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'data_set': data_set,
            'column': column,
            'columnNames': columnNames,
            'hasDataSet': hasDataSet,
            'dataSetNames': dataSetNames,
            'minNodeSize': minNodeSize,
            'maxNodeSize': maxNodeSize,
            'maxEdgeWeight': maxEdgeWeight,
            'minEdgeWeight': minEdgeWeight,
            'clusterData': clusterData,
            'maxNNodes': maxNNodes,
            'previousNYears': previousNYears,
        })
    return mainHTML

import re
import HTMLGenerator
import logging
import DBQueries as DBQ
import QueryExecution as Q

queryObj = DBQ.DBQuery_Extractor()
QueryDict = queryObj.fetch_Query('Student_monthly_report.sql')
# dict views have no .sort() in Python 3, so sort into a list instead
identifierList = sorted(QueryDict.keys())

TaskQueries = []
for i in identifierList:
    if re.search("TASK", i):
        TaskQueries.append(i)
    else:
        Q.DBQueryExecution(QueryDict[i])

monthdict = {
    'January': "'%-01-%'",
    'February': "'%-02-%'",
    'March': "'%-03-%'",
    'April': "'%-04-%'",
    'May': "'%-05-%'",
    'June': "'%-06-%'",
    'July': "'%-07-%'",
    'August': "'%-08-%'",
    'September': "'%-09-%'",
    'October': "'%-10-%'",
    'November': "'%-11-%'",
    'December': "'%-12-%'",
}

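# Hedged usage sketch for monthdict: the values are pre-quoted SQL LIKE
# patterns, presumably spliced into the deferred TASK queries. The query
# template below is hypothetical, purely to show how the patterns slot in.
#
# template = "SELECT * FROM tasks WHERE due_date LIKE {month};"
# print(template.format(month=monthdict['March']))
# # -> SELECT * FROM tasks WHERE due_date LIKE '%-03-%';
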