def reference_update(request):
    errors = []
    warnings = []
    form = None
    valid = False
    inputFile = None
    referenceTableHTML = None
    sampleFileHTML = None
    outFolderName = '../out/'
    fileType = '.xlsx'
    sheetName = 'Sheet1'

    # Submit File View Setup
    if (request.method == "POST" and request.POST.get('upload')):
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            inputFile = request.FILES['file']
            targetSheetName = request.POST.get('target-sheet')
            inputFileDF = pd.read_excel(inputFile, targetSheetName)
            dbq.insertAssigneeKeywords(inputFileDF)
            try:
                if (inputFile):
                    inputFile.close()
                valid = True
            except Exception:
                valid = False
                errors.append(
                    "The process cannot access the input file because it is "
                    "being used by another process.")
        templateHTML = 'parola_refine/index.html'
        mainHTML = render_to_string(templateHTML, {})
        return mainHTML

    # Submit File View
    elif (request.method == "POST" and request.POST.get('process')):
        targetColumnName = request.POST.get('target-column')

    # Default View
    else:
        form = UploadFileForm()

    templateHTML = 'parola_refine/reference_update.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'inputFile': inputFile,
            'sampleFileHTML': sampleFileHTML,
        })
    return mainHTML

def index(request):
    df = pd.read_excel('../out/Assignee Sectors and Industries.xlsx', 'Sheet1')
    dbq.updateAssigneesRankSectorIndustry(df)
    # dbq.insertCPCDescriptions()

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.sort()

    templateHTML = 'data_sets/index.html'
    mainHTML = render_to_string(templateHTML, {
        'dataSetNames': dataSetNames,
    })
    return mainHTML

def index():
    query = ""
    processed_text1 = ""
    response2 = ""
    resultheading = ""

    # Debug
    # time.sleep(5)

    page, per_page, offset = get_page_items()
    total = 0
    pagination = get_pagination(
        page=page,
        per_page=per_page,
        total=total,
        format_total=True,
        format_number=True,
    )

    if request.method == 'GET':
        if 'q' in request.args:
            app.logger.debug("query from user ===> %s<===", request.args['q'])

            # Sanitize & remove trailing space
            query = bleach.clean(request.args['q']).strip()
            app.logger.debug("query from user after bleach ===> %s<===", query)

            # Start: Uncomment to trigger slow response time
            # app.logger.debug("sleeping .....")
            # time.sleep(15)
            # app.logger.debug("awake .....")
            # End: Uncomment to trigger slow response time

            (total, resultheading, processed_text1,
             response2) = DBQueries.ProcessQuery(query, offset, per_page)
            pagination = get_pagination(
                page=page,
                per_page=per_page,
                total=total,
                format_total=True,
                format_number=True,
                record_name='repositories',
            )
            if (processed_text1 == "EMPTY"):
                t1 = Suggestions.compare("now") if (
                    query == "") else Suggestions.compare(query)
                processed_text1 = NORESULT + t1
        else:
            query = ""
            processed_text1 = ""
            response2 = ""

    return render_template("index-bootstrap.html",
                           page=page,
                           total=total,
                           per_page=per_page,
                           pagination=pagination,
                           title='Ask GitHub',
                           showGAcode=os.environ['showGAcode'],
                           appenv=os.environ['deployEnv'],
                           query=[{"text": query}],
                           resultheading=resultheading,
                           response2=response2,
                           processed_text=processed_text1)

def view_data_set(request, data_set):
    df = dbq.getDataSetPatents(data_set)
    dataHTML = df.head(50).to_html()
    return render(request, 'data_sets/view.html', {
        'data_set': data_set,
        'dataHTML': dataHTML
    })

def assignee_trends(request, date):
    dates = []
    previousDate = None
    areaData = None
    targetN = 9
    if (request.method == "POST"):
        targetN = int(request.POST.get('target-n'))

    df = pd.DataFrame()
    df = dbq.getDataSetPatentsBySource(df, 'uspto')
    assigneeSectorsIndustries = dbq.getAssigneeSectorIndustry()

    for d in list(set(df['Dates'].tolist())):
        dates.append(str(d))
    dates.sort(reverse=True)
    date = dates[0]
    previousDate = dates[1]

    # Area chart data preparation
    dfCopy = df.copy()
    dfCopy = pp.assignAssigneeSectorIndustry(dfCopy, assigneeSectorsIndustries,
                                             'uspto')
    dfCopy = dfCopy.dropna(subset=['Sectors'])
    areaData = dfCopy.groupby([
        'Dates', 'Current Assignees'
    ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
    grouped = dfCopy.groupby(['Current Assignees'
                              ]).size().reset_index(name='nPPA')
    topNAssignees = grouped.nlargest(targetN,
                                     'nPPA')['Current Assignees'].tolist()
    areaData = areaData[areaData['Current Assignees'].isin(topNAssignees)]
    areaData = areaData.rename(
        index=str, columns={'Current Assignees': "CurrentAssignees"})
    areaData = areaData.to_json(orient="index")

    templateHTML = 'uspto/assignee_trends.html'
    mainHTML = render_to_string(templateHTML, {
        'date': date,
        'dates': dates,
        'areaData': areaData,
        'targetN': targetN,
    })
    return mainHTML

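# A minimal sketch (toy data, not from the app) of the groupby/unstack/stack
# pattern used in assignee_trends above: it zero-fills missing
# (date, assignee) combinations so every assignee has a row for every date.
# The column names mirror the view; the data is invented for illustration.
#
# import pandas as pd
#
# toy = pd.DataFrame({
#     'Dates': ['2020-01', '2020-01', '2020-02'],
#     'Current Assignees': ['Acme', 'Beta', 'Acme'],
# })
# counts = (toy.groupby(['Dates', 'Current Assignees'])
#              .size()
#              .unstack(fill_value=0)   # missing pairs become explicit 0s
#              .stack()
#              .reset_index(name='nPPA'))
# print(counts)  # Beta gets a 0 row for 2020-02 instead of being absent
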
def tsearch(name=None):
    query = bleach.clean(request.args['q']).strip()
    # print(query)

    # Minimum 5 characters for query
    if (len(query) <= 4):
        t = []  # return nothing!
    else:
        t = DBQueries.Typeahead(query)
    return make_response(dumps(t))

def getCPCDescriptions(df):
    CPCDescriptions = []
    descriptions = dbq.getAllCPCDescription()
    for index, row in df.iterrows():
        descriptionsString = ''
        cpc = row['MAIN CPC']
        finalCPC = str(cpc)[0:4] + str(cpc)[5:str(cpc).find('/')].replace(
            '0', '') + str(cpc)[str(cpc).find('/') + 1:].replace('/', '')
        description = descriptions.loc[descriptions['cpc'] ==
                                       finalCPC]['description']
        if (len(description) > 0):
            descriptionsString = description.tolist()[0]
        CPCDescriptions.append(descriptionsString)
    return CPCDescriptions

def getCPCDescription(CPCs):
    CPCDescriptions = []
    descriptions = dbq.getAllCPCDescription()
    for cpc in CPCs:
        descriptionsString = ''
        finalCPC = (str(cpc)[0:4] +
                    str(cpc)[5:str(cpc).find('/')].replace('0', '') +
                    str(cpc)[str(cpc).find('/') + 1:].replace('/', '')).upper()
        description = descriptions.loc[descriptions['cpc'] ==
                                       finalCPC]['description']
        if (len(description) > 0):
            descriptionsString = description.tolist()[0]
        CPCDescriptions.append(descriptionsString)
    return CPCDescriptions

def getCPCListDescriptions(df):
    CPCDescriptions = []
    descriptions = dbq.getAllCPCDescription()
    for index, row in df.iterrows():
        descriptionsList = []
        descriptionsString = ''
        cpcList = str(row['CPC']).splitlines()
        for cpc in cpcList:
            finalCPC = cpc[0:4] + cpc[5:cpc.find('/')].replace(
                '0', '') + cpc[cpc.find('/') + 1:].replace('/', '')
            description = descriptions.loc[descriptions['cpc'] ==
                                           finalCPC]['description']
            if (len(description) > 0):
                descriptionsString = description.tolist()[0]
                descriptionsList.append(descriptionsString)
        descriptionsString = str(descriptionsList)
        CPCDescriptions.append(descriptionsString)
    return CPCDescriptions

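# Hedged illustration of the CPC key normalization shared by the three
# helpers above. It assumes raw codes look like "A61K 31/00" (section, class,
# and subclass, then a space, main group, "/", subgroup); the exact source
# format is an assumption here, and normalize_cpc is a hypothetical helper,
# not a function the codebase defines.
#
# def normalize_cpc(cpc):
#     cpc = str(cpc)
#     slash = cpc.find('/')
#     return (cpc[0:4]                        # e.g. "A61K"
#             + cpc[5:slash].replace('0', '') # main group, zeros dropped
#             + cpc[slash + 1:].replace('/', '')).upper()
#
# print(normalize_cpc('A61K 31/00'))  # -> "A61K3100"
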
def category_statistics(request, data_set, classification, category):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    hasCategory = False
    nPPA = None
    categoryList = None
    selectedCategory = str(category).replace('_', '/')
    selectedCPCDescription = None
    selectedClass = ''
    selectedClassificationDisplay = ''
    maxYear = None
    assigneeData = None
    yearData = None
    targetAssigneeColumnName = "CA"
    targetYearColumnName = "YEAR"
    targetCategoryColumnName = "CATEGORY"
    previousNYears = 20

    # Data set selection view
    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    classificationNames = dbq.getClassificationList()
    classificationNames.insert(0, 'index')

    # Category selection view
    if (not data_set == 'index' and not classification == 'index'
            and selectedCategory == 'index'):
        df = pd.DataFrame()
        df = dbq.getDataSetPatentColumn(data_set, df, classification)
        selectedClass = df[classification].tolist()
        nPPA = len(df.index)
        allClass = []
        for cList in selectedClass:
            if (cList and cList == cList):
                for c in cList:
                    if (c and c == c and c != 'nan' and c != 'NAN'):
                        allClass.append(c.lstrip().rstrip())
        categoryList = sorted(list(set(allClass)))
        categoryList.insert(0, 'index')
        hasDataSet = True

    # Graph preparations
    elif (not data_set == 'index' and not classification == 'index'
          and not selectedCategory == 'index'):
        if (request.method == "POST"):
            previousNYears = int(request.POST.get('target-n-years'))
        df = pd.DataFrame()
        df = dbq.getDataSetPatentYears(data_set, df)
        nPPA = len(df.index)
        df = dbq.getDataSetPatentAssignees(data_set, df)
        df['Current Assignees'] = df['Clean Assignees'].tolist()
        df = dbq.getDataSetPatentTypes(data_set, df)
        df = dbq.getDataSetPatentColumn(data_set, df, classification)
        selectedClass = df[classification].tolist()
        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]
        years = df['Years']
        CAs = df['Current Assignees']
        types = df['Types']

        allClass = []
        allYears = []
        allCAs = []
        allTypes = []
        for year, CA, cList, patentType in zip(years, CAs, selectedClass,
                                               types):
            if (cList == cList and cList != None):
                for c in cList:
                    if (c and c == c and c != 'nan'):
                        allClass.append(c.lstrip().rstrip())
                        allYears.append(year)
                        allCAs.append(CA)
                        allTypes.append(patentType)
        categoryList = sorted(list(set(allClass)))

        if (selectedCategory in categoryList):
            expandedDF = pd.DataFrame()
            expandedDF[targetCategoryColumnName] = allClass
            expandedDF['Years'] = allYears
            expandedDF[targetAssigneeColumnName] = allCAs
            expandedDF['Types'] = allTypes
            expandedDF.drop(expandedDF[
                expandedDF[targetCategoryColumnName] !=
                selectedCategory.lstrip().rstrip()].index,
                            inplace=True)
            nPPA = len(expandedDF.index)

            # Year Bar Graph Data
            yearData = pd.crosstab(expandedDF['Years'],
                                   [expandedDF['Types']]).reset_index()
            yearData = yearData.rename(index=str,
                                       columns={'Years': "Categories"})
            yearData = yearData.to_json(orient="index")
            # yearData = cim.getCrossTabInput(expandedDF, 'Years', 'Types', 10)

            # Assignee Bar Graph Data
            assigneeData = cim.getCrossTabInput(expandedDF,
                                                targetAssigneeColumnName,
                                                'Types', 10)
            hasDataSet = True
            hasCategory = True
            valid = True
        else:
            categoryList.insert(0, 'index')
            category = selectedCategory = 'index'
            hasDataSet = True
            hasCategory = False

    templateHTML = 'visualization/category_statistics.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'hasDataSet': hasDataSet,
            'hasCategory': hasCategory,
            'classification': classification,
            'selectedClassificationDisplay': selectedClassificationDisplay,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'nPPA': nPPA,
            'dataSetNames': dataSetNames,
            'classificationNames': classificationNames,
            'categoryList': categoryList,
            'data_set': data_set,
            'category': category,
            'selectedCategory': selectedCategory,
            'selectedCPCDescription': selectedCPCDescription,
            'maxYear': maxYear,
            'yearData': yearData,
            'assigneeData': assigneeData,
            'previousNYears': previousNYears,
        })
    return mainHTML

def landscape_map(request, data_set):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    columnList = None
    dataSetNames = None
    topicsTableHTML = None
    stringUniqueAllTopics = None
    n_top_documents = 3
    documentsPerTopic = []
    selectedDataSet = data_set
    uniqueStringFeatureNames = None
    tsneData = None
    # colorList = ["#2D777F", "#173B40", "#2b9ca5", "#7cc9d2", "#5BEEFF", "#51D6E5", "#44B2BF", "#e5f2f9", "#E9EDDE", "#e2e2e2"]
    # colorList = ["#5BEEFF", "#d62728", "#2b9ca5", "#ff7f0e", "#756bb1", "#66aa00", "#1f77b4", "#bcbd22"]
    colorList = [
        "#ed7d2e", "#258296", "#6cc8db", "#194660", "#316e8c", "#2D777F",
        "#1f77b4", "#8a8a8a", "#756bb1", "#bcbd22"
    ]
    outFolderName = '../templates/visualization/'
    fileType = '.xlsx'
    scatterFileName = 'landscapeMapInput'
    scatterFileType = '.tsv'
    targetTitlesColumnName = 'Titles'
    targetAbstractsColumnName = 'Abstracts'
    targetIndependentClaimsColumnName = 'Independent Claims'

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')

    # Model setup view
    if (not selectedDataSet == 'index' and not request.method == "POST"):
        hasDataSet = True

    # Graph preparation
    elif (request.method == "POST"):
        n_top_words = 12
        df = pd.DataFrame()
        df = dbq.getDataSetPatentTACs(data_set, df)
        if (len(df.index) > 1000):
            # df = df.sample(n=500, replace=False)
            df = df.sample(frac=0.1, replace=False)
        df = df.dropna(subset=[targetTitlesColumnName])
        df = df.dropna(subset=[targetAbstractsColumnName])
        df = df.dropna(subset=[targetIndependentClaimsColumnName])

        targetFeatures = request.POST.getlist('features')
        targetMethod = request.POST.get('method')
        targetNumberOfComponents = int(
            request.POST.get('target-n-components'))
        targetNGram1 = int(request.POST.get('target-n-gram-1'))
        targetNGram2 = int(request.POST.get('target-n-gram-2'))

        wordList = []
        with open('../out/words.txt', 'r') as f:
            for line in f:
                wordList.append(line.rstrip())

        featuresDF = pd.DataFrame()
        cleanTitles = cc.removePublicationNumbers(df[targetTitlesColumnName])
        # featuresDF[targetTitlesColumnName] = pp.normalizeCorpusAsStrings(cleanTitles, wordList)
        # featuresDF[targetAbstractsColumnName] = pp.normalizeCorpusAsStrings(df[targetAbstractsColumnName], wordList)
        # featuresDF[targetIndependentClaimsColumnName] = pp.normalizeCorpusAsStrings(df[targetIndependentClaimsColumnName], wordList)
        if ('titles' in targetFeatures):
            temp = cc.removePublicationNumbers(df[targetTitlesColumnName])
            temp = pp.normalizeCorpusAsStrings(temp, wordList)
            df[targetTitlesColumnName] = cc.remove2LetterWords(temp)
            featuresDF[targetTitlesColumnName] = df[targetTitlesColumnName]
        if ('abstracts' in targetFeatures):
            temp = cc.removePublicationNumbers(df[targetAbstractsColumnName])
            temp = pp.normalizeCorpusAsStrings(temp, wordList)
            df[targetAbstractsColumnName] = cc.remove2LetterWords(temp)
            featuresDF[targetAbstractsColumnName] = df[
                targetAbstractsColumnName]
        if ('independentclaims' in targetFeatures):
            temp = cc.removePublicationNumbers(
                df[targetIndependentClaimsColumnName])
            temp = pp.normalizeCorpusAsStrings(temp, wordList)
            temp = cc.remove2LetterWords(temp)
            df[targetIndependentClaimsColumnName] = cc.removeDigits(temp)
            featuresDF[targetIndependentClaimsColumnName] = df[
                targetIndependentClaimsColumnName]

        transformerList = []
        if ('titles' in targetFeatures):
            transformerList.append(
                ('titles',
                 Pipeline([
                     ('selector',
                      ise.ItemSelector(key=targetTitlesColumnName)),
                     ('vectorizer',
                      TfidfVectorizer(stop_words='english',
                                      lowercase=True,
                                      ngram_range=(targetNGram1,
                                                   targetNGram2))),
                 ])))
        if ('abstracts' in targetFeatures):
            transformerList.append(
                ('abstracts',
                 Pipeline([
                     ('selector',
                      ise.ItemSelector(key=targetAbstractsColumnName)),
                     ('vectorizer',
                      TfidfVectorizer(stop_words='english',
                                      lowercase=True,
                                      ngram_range=(targetNGram1,
                                                   targetNGram2))),
                 ])))
        if ('independentclaims' in targetFeatures):
            transformerList.append(
                ('independent claims',
                 Pipeline([
                     ('selector',
                      ise.ItemSelector(key=targetIndependentClaimsColumnName)),
                     ('vectorizer',
                      TfidfVectorizer(stop_words='english',
                                      lowercase=True,
                                      ngram_range=(targetNGram1,
                                                   targetNGram2))),
                 ])))

        # Model fitting
        pipeline = Pipeline([
            ('union', FeatureUnion(transformer_list=transformerList)),
            ('clf',
             LatentDirichletAllocation(n_components=targetNumberOfComponents,
                                       random_state=10,
                                       doc_topic_prior=.1,
                                       learning_method='online',
                                       learning_offset=50,
                                       max_iter=random.randint(1, 6))),
            # ('clf', LatentDirichletAllocation(
            #     n_components=targetNumberOfComponents,
            #     random_state=0,
            #     learning_method='online',
            #     learning_offset=50,
            #     max_iter=5)),
            # ('clf', NMF(n_components=10, random_state=1,
            #     beta_loss='kullback-leibler', solver='mu', max_iter=1000,
            #     alpha=.1, l1_ratio=.5)),
        ])
        pipeline.fit(featuresDF)
        model = pipeline.named_steps['clf']
        wordToTopics = model.components_ / model.components_.sum(
            axis=1)[:, np.newaxis]
        topicToDocuments = pipeline.transform(featuresDF)

        words = []
        for transformer in pipeline.steps[0][1].transformer_list:
            words = words + transformer[1].named_steps[
                'vectorizer'].get_feature_names()

        topicsDF = pd.DataFrame()
        allTopics = []
        stringAllTopics = []
        stringUniqueAllTopics = []
        # allImportances = []
        allDocuments = []
        documentsPerTopic = []
        for topic_idx, topic in enumerate(wordToTopics):
            featureNames = [
                words[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ]
            stringFeatureNames = str(featureNames).replace("'", '').replace(
                "[", '').replace("]", '')
            uniqueStringFeatureNames = list(
                set(stringFeatureNames.replace(",", '').split(' ')))
            stringAllTopics.append(stringFeatureNames)
            stringUniqueAllTopics.append(
                str(uniqueStringFeatureNames).replace("'", '').replace(
                    "[", '').replace("]", '').replace(",", ''))
            # featureImportances = [importance for importance in topic.argsort()[:n_top_words]]
            top_doc_indices = np.argsort(
                topicToDocuments[:, topic_idx])[::-1][0:n_top_documents]
            tempDocumentsPerTopic = []
            for doc_index in top_doc_indices:
                allTopics.append(featureNames)
                # allImportances.append(featureImportances)
                tempDocumentsPerTopic.append(cleanTitles[doc_index])
                allDocuments.append(cleanTitles[doc_index])
            documentsPerTopic.append(tempDocumentsPerTopic)

        topicsDF['Topics'] = allTopics
        topicsTable = pd.DataFrame()
        topicsTable['Topics'] = stringUniqueAllTopics
        topicsTableHTML = topicsTable.to_html(index=False)
        stringUniqueAllTopics = zip(colorList, stringUniqueAllTopics)
        documentsPerTopic = zip(colorList, documentsPerTopic)
        # topicsDF['Importances'] = allImportances
        topicsDF['Documents'] = allDocuments
        # topicsDF.to_excel(outFolderName + 'LDA2' + fileType)

        topWords = []
        for topic_idx, topic in enumerate(wordToTopics):
            featureNames = [
                words[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ]
            topWords.append(
                str(featureNames).replace('[', '').replace(']', '').replace(
                    "'", ''))

        mostProbableTopicsIndex = [0] * len(wordToTopics[0])
        mostProbableTopics = [0] * len(wordToTopics[0])
        for word_idx, topics in enumerate(np.transpose(wordToTopics)):
            mostProbableTopicsIndex[word_idx] = np.argmax(topics)
            mostProbableTopics[word_idx] = topWords[np.argmax(topics)]

        # tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
        # tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca', perplexity=30, early_exaggeration=5)
        tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
        tsne_lda = tsne_model.fit_transform(np.transpose(wordToTopics))
        # tsne_lda = tsne_model.fit_transform(topicToDocuments)
        xCoords = tsne_lda[:, 0]
        yCoords = tsne_lda[:, 1]

        tsnseDF = pd.DataFrame()
        tsnseDF['x'] = xCoords
        tsnseDF['y'] = yCoords
        tsnseDF['n'] = mostProbableTopicsIndex
        tsnseDF['label'] = mostProbableTopics
        tsnseDF.to_csv(outFolderName + scatterFileName + scatterFileType,
                       sep='\t',
                       index=False)
        tsneData = tsnseDF.to_json(orient="index")

        hasDataSet = True
        valid = True

    templateHTML = 'visualization/landscape_map.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'hasDataSet': hasDataSet,
            'errors': errors,
            'warnings': warnings,
            'columnList': columnList,
            'data_set': data_set,
            'dataSetNames': dataSetNames,
            'selectedDataSet': selectedDataSet,
            'stringUniqueAllTopics': stringUniqueAllTopics,
            'documentsPerTopic': documentsPerTopic,
            'n_top_documents': n_top_documents,
            'topicsTableHTML': topicsTableHTML,
            'tsneData': tsneData,
        })
    return mainHTML

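# A minimal, self-contained sketch of the topic-model-then-project idea used
# in landscape_map: TF-IDF features -> LatentDirichletAllocation -> t-SNE of
# the normalized word-topic matrix. The corpus and all parameters here are
# toy assumptions; note LDA is conventionally fed raw counts, and TF-IDF is
# used only to mirror the view above.
#
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.manifold import TSNE
#
# docs = [
#     'battery charging circuit for electric vehicles',
#     'lithium battery cell management',
#     'image sensor pixel array readout',
#     'cmos image sensor noise reduction',
# ]
# X = TfidfVectorizer(stop_words='english').fit_transform(docs)
# lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
#
# # Normalize rows so each topic is a probability distribution over words.
# word_to_topics = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
#
# # Project words into 2D; perplexity must stay below the number of words.
# coords = TSNE(n_components=2, random_state=0,
#               perplexity=5).fit_transform(word_to_topics.T)
# print(coords.shape)  # (n_words, 2) -- one scatter point per word
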
def mainMenu():
    # ascii_banner = pyfiglet.figlet_format("Welcome to the assignment database")
    # print(ascii_banner)
    custom_figlet = Figlet(width=250)
    print(custom_figlet.renderText("Welcome to the assignment database"))

    while (True):
        print()  # formatting
        print("You have the following options:\n\
1: Display all assignments currently in the database\n\
2: Display records from the other tables\n\
3: Filter assignments by certain parameters (date due, instructor, class, department, etc.)\n\
4: Add in a new record\n\
5: Delete a record\n\
6: Create a separate table with specified information\n\
7: Generate table to CSV file\n\
8: Undo all previous changes\n\
9: Commit all current changes (NOTE: ONCE YOU SELECT THIS OPTION ALL CURRENT CHANGES WILL BE WRITTEN TO THE DB AND THE ACTION CANNOT BE UNDONE)")
        print()  # formatting
        userOption = input("Please input the number associated with the option you would like to perform (type \"Q\" or \"q\" to exit the program): ")
        if (userOption == 'Q' or userOption == 'q'):
            print()  # formatting
            exit_banner = pyfiglet.figlet_format("Exiting")
            print(exit_banner)
            break
        if (userOption.isnumeric()):
            userOption = int(userOption)
        else:
            print()  # formatting
            invalidInputFiglet = Figlet(width=250)
            print(invalidInputFiglet.renderText(
                "Invalid input. Please type a value from 1 - 9"))
            continue

        if (userOption == 1):
            # Print out all of the assignments currently in the database
            EstablishConnection.printAllAssignments()
        elif (userOption == 2):
            repeat = True
            while (repeat):
                tableOutput = int(input("From which table would you like to print the records: \n\
1: Faculty\n\
2: Courses\n\
3: Department\n\
4: Schools\n\
Option: "))
                if (tableOutput == 1):
                    EstablishConnection.displayFacultyData()
                    repeat = False
                elif (tableOutput == 2):
                    EstablishConnection.displayCourseData()
                    repeat = False
                elif (tableOutput == 3):
                    EstablishConnection.displayDepartmentData()
                    repeat = False
                elif (tableOutput == 4):
                    EstablishConnection.displaySchoolData()
                    repeat = False
                else:
                    print("Unrecognized input. Please type a value from 1 - 4, depending on the table from which you would like to print the records")
        elif (userOption == 3):
            detailedView = input("Would you like to see a detailed view of the assignments (as in instructor of the course, course name, etc.)?\n\
Typing 'N' will only print out the Assignment and its related details rather than a detailed list including instructor details, course details, etc: ")
            if (detailedView == 'Y' or detailedView == 'y'):
                repeat = True
                while (repeat):
                    parameterChosen = int(input("Which details would you like to print/output: \n\
1: Assignments, Courses, and Department\n\
2: Assignments, Courses, Faculty\n\
3: Assignments, Courses, Department, and Specific College\n\
4: Assignments and Courses Only\n\
Option: "))
                    if (parameterChosen == 1):
                        departmentName = input("Name of Department (case sensitive): ")
                        DBQueries.printAssignmentsCoursesDepartmentInfo(departmentName)
                        repeat = False
                    elif (parameterChosen == 2):
                        facultyName = input("Name of faculty member (FirstName LastName with a space): ")
                        DBQueries.printAssignmentsCoursesFacultyInfo(facultyName)
                        repeat = False
                    elif (parameterChosen == 3):
                        collegeName = input("Name of college from which you would like to filter the assignments outstanding: ")
                        # InsertValues.filterAssignmentsOutstandingByDepartment(departmentName)
                        DBQueries.printAssignmentsCoursesDepartmentCollegeInfo(collegeName)
                        repeat = False
                    elif (parameterChosen == 4):
                        courseName = input("Name of course from which you would like to filter the assignments outstanding: ")
                        DBQueries.printAssignmentsCoursesInfo(courseName)
                        repeat = False
                    else:
                        print("Unrecognized input. Please type a value from 1 - 4, depending on the details you would like to print")
            else:
                repeat = True
                while (repeat):
                    parameterChosen = int(input("By which parameter would you like to filter the assignments outstanding: \n\
1: Faculty\n\
2: Courses\n\
3: Department\n\
4: Schools\n\
Option: "))
                    if (parameterChosen == 1):
                        facultyName = input("Name of faculty member: MUST BE in FirstName LastName format (with a space) ")
                        DBQueries.filterOutstandingAssignmentsByFaculty(facultyName)
                        repeat = False
                    elif (parameterChosen == 2):
                        courseName = input("Name of course: ")
                        DBQueries.filterOutstandingAssignmentsByCourse(courseName)
                        repeat = False
                    elif (parameterChosen == 3):
                        departmentName = input("Name of department: ")
                        # InsertValues.filterAssignmentsOutstandingByDepartment(departmentName)
                        DBQueries.filterOutstandingAssignmentsByDepartment(departmentName)
                        repeat = False
                    elif (parameterChosen == 4):
                        schoolName = input("School/College Name: ")
                        DBQueries.filterOutstandingAssignmentsBySchool(schoolName)
                        repeat = False
                    else:
                        print("Unrecognized input. Please type a value from 1 - 4, depending on the parameter by which you would like to filter")
        elif (userOption == 4):
            repeat = True
            while (repeat):
                tableToAdd = int(input("Please input the option of the table to which you wish to add: \n\
1: Assignments\n\
2: Faculty\n\
3: Courses\n\
4: Department\n\
5: Schools\n\
Option: "))
                if (tableToAdd == 1):
                    InsertDeleteValues.theAssignmentDetails()
                    repeat = False
                elif (tableToAdd == 2):
                    InsertDeleteValues.theFacultyDetails()
                    repeat = False
                elif (tableToAdd == 3):
                    InsertDeleteValues.theCourseDetails()
                    repeat = False
                elif (tableToAdd == 4):
                    InsertDeleteValues.theDepartmentDetails()
                    repeat = False
                elif (tableToAdd == 5):
                    InsertDeleteValues.universitySchoolsDetails()
                    repeat = False
                else:
                    print("Unrecognized input. Please type a value from 1 - 5, depending on the table to which you would like to add")
        elif (userOption == 5):
            repeat = True
            while (repeat):
                tableToAdd = int(input("Please input the option of the table from which you wish to delete a record: \n\
1: Assignment\n\
2: Faculty\n\
3: Courses\n\
4: Department\n\
5: Schools\n\
Option: "))
                if (tableToAdd == 1):
                    InsertDeleteValues.deleteFromAssignments()
                    repeat = False
                elif (tableToAdd == 2):
                    InsertDeleteValues.deleteFromFaculty()
                    repeat = False
                elif (tableToAdd == 3):
                    InsertDeleteValues.deleteFromCourses()
                    repeat = False
                elif (tableToAdd == 4):
                    InsertDeleteValues.deleteFromDepartment()
                    repeat = False
                elif (tableToAdd == 5):
                    InsertDeleteValues.deleteFromUniversitySchools()
                    repeat = False
                else:
                    print("Unrecognized input. Please type a value from 1 - 5, depending on the table from which you would like to delete")
        elif (userOption == 6):
            repeat = True
            while (repeat):
                specifics = input("From which tables would you like to pull information: \n\
1: Assignments and Courses\n\
2: Assignments and Faculty\n\
3: Assignments and Department\n\
4: Assignments and College\n\
Option: ")
                if (int(specifics) == 1):
                    CreateViews.CreateViewAssignmentsCourses()
                    repeat = False
                elif (int(specifics) == 2):
                    CreateViews.CreateViewAssignmentsFaculty()
                    repeat = False
                elif (int(specifics) == 3):
                    CreateViews.CreateViewAssignmentsDepartment()
                    repeat = False
                elif (int(specifics) == 4):
                    CreateViews.CreateViewAssignmentsCollege()
                    repeat = False
                else:
                    print("Invalid input. Please type a value from 1 - 4")
        elif (userOption == 7):
            repeat = True
            while (repeat):
                selection = input("Which table would you like to have a report of? Type \"exit\" to quit (Assignments, Courses, Departments, Faculty, Schools): ")
                if selection.lower() == "assignments":
                    EstablishConnection.returnAssignments()
                elif selection.lower() == "courses":
                    EstablishConnection.returnCourses()
                elif selection.lower() == "departments":
                    EstablishConnection.returnDepartment()
                elif selection.lower() == "faculty":
                    EstablishConnection.returnFaculty()
                elif selection.lower() == "schools":
                    EstablishConnection.returnSchool()
                elif selection.lower() == "exit":
                    repeat = False
                else:
                    print("Invalid input. Type 'exit' to quit.")
        elif (userOption == 8):
            # Roll back all of the actions the user has taken
            InsertDeleteValues.RollbackAction()
            print()
            print("Changes rolled back")
        elif (userOption == 9):
            # Commit all of the actions the user has taken
            InsertDeleteValues.CommitAction()
            print()
            print("Changes committed")

def assignee_grouping(request, data_set):
    form = None
    errors = []
    warnings = []
    valid = False
    inputFile = None
    referenceTableHTML = None
    assigneeGroupingHTML = None
    assigneeGroupingSummaryHTML = None
    sampleFileHTML = None
    dataSetNames = []
    outFileName = 'Assignee Grouping'
    outFolderName = '../out/'
    fileType = '.xlsx'
    sheetName = 'Sheet1'

    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')

    if (not data_set == 'index'):
        df = pd.DataFrame()
        df = dbq.getDataSetPatentAssignees(data_set, df)
        keywordsDF = dbq.getAllAssigneeKeywords()
        df = pp.updateAssigneeGrouping(df, keywordsDF)
        dbq.updateCleanCurrentAssignees(data_set, df)
        df.to_excel(outFolderName + outFileName + fileType)
        assigneeGroupingHTML = df.head(10).to_html(index=False)
        valid = True
        # for CAs in df[targetColumnName].tolist():
        #     if(CAs == CAs):
        #         tempCAList = []
        #         CAList = str(CAs).splitlines()
        #         for CA in CAList:
        #             group = pp.getAssgineeGroup(CA, referenceDF)
        #             if(CA.lower().lstrip().rstrip() != group.lower().lstrip().rstrip()):
        #                 allCounts[allGroups.index(group)] = allCounts[allGroups.index(group)] + 1
        #             tempCAList.append(group)
        #         tempCAListString = str(tempCAList).lstrip().rstrip().replace("'", '')
        #         newCAs.append(tempCAListString)
        #     else:
        #         newCAs.append(None)
        # df['Assignee Group'] = newCAs
        # referenceDF['Counts'] = allCounts
        # df.to_excel(outFolderName + outFileName + fileType)
        # assigneeGroupingHTML = df.to_html(index=False)
        # referenceDF = referenceDF.drop(columns=['Contains', 'Does Not Contain'])
        # assigneeGroupingSummaryHTML = referenceDF.to_html(index=False)

    templateHTML = 'parola_refine/assignee_grouping.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'data_set': data_set,
            'dataSetNames': dataSetNames,
            'assigneeGroupingHTML': assigneeGroupingHTML,
        })
    return mainHTML

def cluster_map(request, data_set, classification1, classification2):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    clusterData = None
    dataSetNames = []
    selectedClassificationDisplay = ''
    selectedClass = ''
    minNodeSize = 99999
    maxNodeSize = 0
    minEdgeWeight = 99999
    maxEdgeWeight = 0
    minNodeSizeWithLabel = 20
    maxNNodes = 20
    topN = 10
    previousNYears = 20
    targetCPCColumnName = 'CPCs'
    outFileName = 'clusterMapInput'
    outFolderName = '../templates/visualization/'
    fileType = '.json'

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    classificationNames = dbq.getClassificationList()
    classificationNames.insert(0, 'index')

    # Model setup view
    if (not (data_set == 'index' or classification1 == 'index'
             or classification2 == 'index'
             or classification1 == classification2)):
        # df = dbq.getDataSetPatents(data_set)
        # if(len(df.index) > 1000):
        #     df = df.sample(n=500, replace=False, random_state=17)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentColumn(data_set, df, classification1)
        selectedClass1 = df[classification1].tolist()
        df = dbq.getDataSetPatentColumn(data_set, df, classification2)
        selectedClass2 = df[classification2].tolist()
        df = dbq.getDataSetPatentYears(data_set, df)
        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]

        allCategories = []
        uniqueCategories1 = []
        uniqueCategories2 = []
        combinedCategories = []
        allClass1 = []
        allClass2 = []
        for cList1 in selectedClass1:
            if (cList1 == cList1 and cList1 != None):
                for c in cList1:
                    if (c != 'nan' and c != '' and c != 'NAN' and c != 'Nan'):
                        allClass1.append(c)
        for cList2 in selectedClass2:
            if (cList2 == cList2 and cList2 != None):
                for c in cList2:
                    if (c != 'nan' and c != '' and c != 'NAN' and c != 'Nan'):
                        allClass2.append(c)

        expandedDF1 = pd.DataFrame()
        expandedDF1[classification1] = allClass1
        expandedDF2 = pd.DataFrame()
        expandedDF2[classification2] = allClass2
        grouped = expandedDF1.groupby([classification1
                                       ]).size().reset_index(name='nPPA')
        topNClassification1 = grouped.nlargest(
            topN, 'nPPA')[classification1].tolist()
        grouped = expandedDF2.groupby([classification2
                                       ]).size().reset_index(name='nPPA')
        topNClassification2 = grouped.nlargest(
            topN, 'nPPA')[classification2].tolist()

        # for c2, c3, c4 in zip(categories2, categories3, categories4):
        for c1, c2 in zip(selectedClass1, selectedClass2):
            if (not c1):
                c1 = []
            if (not c2):
                c2 = []
            c1 = [c for c in c1 if c in topNClassification1]
            c2 = [c for c in c2 if c in topNClassification2]
            allCategories = allCategories + c1 + c2
            combinedCategories.append(c1 + c2)
            uniqueCategories1 = uniqueCategories1 + c1
            uniqueCategories2 = uniqueCategories2 + c2
        uniqueCategories1 = list(set(uniqueCategories1))
        uniqueCategories2 = list(set(uniqueCategories2))

        # expanded = pd.DataFrame()
        # expanded['Categories'] = allCategories
        # categorySizes = expanded.groupby(['Categories']).size().reset_index(name='nPPA')
        # categoryList = categorySizes['Categories'].tolist()
        # categorySizesList = categorySizes['nPPA'].tolist()
        selectedClass = combinedCategories
        # selectedClass = categoryByKeywords
        # wordList = []
        # f = open('../out/words.txt', 'r')
        # for line in f:
        #     wordList.append(line.rstrip())
        # f.close()
        # titleWords = pp.normalizeCorpus(df['Titles'].tolist(), wordList)

        allClass = []
        for c in selectedClass:
            if (c):
                allClass = allClass + list(filter(lambda a: a != '', c))
        expanded = pd.DataFrame()
        expanded['Class'] = allClass
        classSizes = expanded.groupby(['Class']).size().reset_index(name='nPPA')
        classList = classSizes['Class'].tolist()
        classSizesList = classSizes['nPPA'].tolist()
        grouped = expanded.groupby(
            ['Class']).size().reset_index(name='Number of P/PA')
        topNClass = grouped.nlargest(10, 'Number of P/PA')['Class'].tolist()

        # Cleaning of CPC
        relationships = selectedClass
        relationshipsEval = []
        if (maxNNodes > 0):
            topNNodes = v.getTopNNodes(relationships, maxNNodes)
            for rList in relationships:
                tempRList = []
                for node in list(filter(lambda a: a != '', rList)):
                    if (node in topNNodes):
                        tempRList.append(node)
                relationshipsEval.append(tempRList)
        else:
            for rList in relationships:
                relationshipsEval.append(list(filter(lambda a: a != '',
                                                     rList)))

        source = []
        target = []
        weight = []
        for r in relationshipsEval:
            pairs = combinations(r, 2)
            for p in pairs:
                if ((p[0] in uniqueCategories1 and p[1] in uniqueCategories1)
                        or (p[0] in uniqueCategories2
                            and p[1] in uniqueCategories2)):
                    continue
                else:
                    source.append(p[0])
                    target.append(p[1])
                    weight.append(1)
                # source.append(p[0])
                # target.append(p[1])
                # weight.append(1)

        newDF = pd.DataFrame()
        newDF['source'] = source
        newDF['target'] = target
        newDF['weight'] = weight
        graphDF = newDF.groupby(['source', 'target']).sum().reset_index()
        maxEdgeWeight = graphDF['weight'].max()
        minEdgeWeight = graphDF['weight'].min()
        # graphDF.to_excel(outFolderName + 'edgelist.xlsx')

        G = nx.from_pandas_edgelist(graphDF, 'source', 'target', 'weight')
        G2 = nx.convert_node_labels_to_integers(G, label_attribute='name')
        # Determine node groups using Louvain modularity
        # communities = best_partition(G2, weight='size')
        d = nx.readwrite.json_graph.node_link_data(G2, {'name': 'index'})

        nodeNames = []
        nodeCommunities = []
        nodeSizes = []
        nodeTop10 = []
        for node in d['nodes']:
            name = node['name']
            # size = G2.degree[list(G.nodes()).index(node['name'])]
            size = classSizesList[classList.index(node['name'])]
            community = 2
            if (name in uniqueCategories1):
                community = 0
            if (name in uniqueCategories2):
                community = 1
            # community = communities[list(G.nodes()).index(node['name'])]
            node['size'] = size
            node['group'] = community
            nodeNames.append(name)
            nodeSizes.append(size)
            nodeCommunities.append(community)
            if (node['size'] < minNodeSize):
                minNodeSize = node['size']
            if (node['size'] > maxNodeSize):
                maxNodeSize = node['size']
        # minNodeSizeWithLabel = 0.2 * maxNodeSize
        # for node in d['nodes']:
        #     if(node['size'] < minNodeSizeWithLabel):
        #         node['name'] = None
        for node in d['nodes']:
            if (not node['name'] in topNClass):
                node['fontSize'] = 8
                node['opacity'] = 0.5
            else:
                node['fontSize'] = node['size']
                node['opacity'] = 1

        nodesDF = pd.DataFrame()
        nodesDF['CPC'] = nodeNames
        nodesDF['Size'] = nodeSizes
        nodesDF['Community'] = nodeCommunities

        del d["directed"]
        del d["multigraph"]
        del d["graph"]
        clusterData = d
        hasDataSet = True
        valid = True

    templateHTML = 'visualization/cluster_map.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'data_set': data_set,
            'classification1': classification1,
            'classification2': classification2,
            'classificationNames': classificationNames,
            'selectedClassificationDisplay': selectedClassificationDisplay,
            'hasDataSet': hasDataSet,
            'dataSetNames': dataSetNames,
            'minNodeSize': minNodeSize,
            'maxNodeSize': maxNodeSize,
            'maxEdgeWeight': maxEdgeWeight,
            'minEdgeWeight': minEdgeWeight,
            'clusterData': clusterData,
            'previousNYears': previousNYears,
        })
    return mainHTML

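# Toy sketch of the co-occurrence graph construction in cluster_map: every
# pairwise combination inside a record becomes a unit-weight edge, duplicate
# edges are summed into weights, and the result is loaded into networkx.
# The record values are invented CPC-like labels for illustration.
#
# from itertools import combinations
# import pandas as pd
# import networkx as nx
#
# records = [['A01B', 'B60K'], ['A01B', 'B60K', 'C07D'], ['C07D']]
# rows = [(a, b, 1) for r in records for a, b in combinations(r, 2)]
# edges = (pd.DataFrame(rows, columns=['source', 'target', 'weight'])
#            .groupby(['source', 'target']).sum().reset_index())
# G = nx.from_pandas_edgelist(edges, 'source', 'target', 'weight')
# print(edges)                 # A01B-B60K appears twice, so its weight is 2
# print(G.edges(data=True))
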
def dataset_statistics(request, data_set, classification):
    nPPA = None
    CPCLegend = None
    selectedClassificationDisplay = ''
    selectedClass = ''
    minCount = 1
    maxCount = 15
    minCount2 = 1
    maxCount2 = 15
    minCount3 = 1
    maxCount3 = 15
    targetN = 12
    previousNYears = 20
    yearData = None
    areaData = None
    categoryData = None
    categoryPercentData = None
    categoryYearData = None
    assigneeData = None
    assigneeYearData = None
    categoryAssigneeData = None
    sunBurstData = None
    smallMultipleData = None

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    classificationNames = dbq.getClassificationList()
    classificationNames.insert(0, 'index')

    if (data_set != 'index'):
        if (request.method == "POST"):
            targetN = int(request.POST.get('target-n'))
            previousNYears = int(request.POST.get('target-n-years'))

        # df = dbq.getDataSetPatents(data_set)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentYears(data_set, df)
        df = dbq.getDataSetPatentAssignees(data_set, df)
        assigneeSectorsIndustries = dbq.getAssigneeSectorIndustry()
        df['Current Assignees'] = df['Clean Assignees'].tolist()
        df = dbq.getDataSetPatentTypes(data_set, df)
        df = dbq.getDataSetPatentColumn(data_set, df, classification)

        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]
        df = df[df['Current Assignees'] != '']
        years = df['Years']
        CAs = df['Current Assignees']
        types = df['Types']
        selectedClass = df[classification].tolist()
        nPPA = len(df.index)

        topNAssignees = pd.crosstab(
            df['Current Assignees'], [df['Types']],
            margins=True).sort_values(by=['All'],
                                      ascending=False).reset_index().drop(
                                          ['All'], axis=1).drop([0]).head(
                                              targetN)['Current Assignees'].tolist()

        dfCopy = df.copy()
        smallMultipleData = dfCopy.groupby([
            'Years', 'Current Assignees'
        ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
        smallMultipleData = smallMultipleData[
            smallMultipleData['Current Assignees'].isin(topNAssignees)]
        smallMultipleData = smallMultipleData.rename(
            index=str, columns={'Current Assignees': "CurrentAssignees"})
        smallMultipleData = smallMultipleData.to_json(orient="index")

        dfCopy = df.copy()
        dataSetSource = Datasets.objects.filter(name=data_set)[0].source
        dfCopy = pp.assignAssigneeSectorIndustry(dfCopy,
                                                 assigneeSectorsIndustries,
                                                 dataSetSource)
        dfCopy = dfCopy.dropna(subset=['Sectors'])
        dfCopy = dfCopy.dropna(subset=['Industries'])
        dfCopy = dfCopy[['Sectors', 'Industries', 'ids']]
        sectorsIndustriesCounts = dfCopy.groupby(
            ['Sectors', 'Industries']).size().reset_index(name='nPPA')

        d = {"name": "flare", "children": []}
        for line in sectorsIndustriesCounts.values:
            the_parent = line[0]
            the_child = line[1]
            child_size = line[2]
            keys_list = []
            for item in d['children']:
                keys_list.append(item['name'])
            if not the_parent in keys_list:
                d['children'].append({
                    "name": the_parent,
                    "children": [{
                        "name": the_child,
                        "size": child_size
                    }]
                })
            else:
                d['children'][keys_list.index(the_parent)]['children'].append(
                    {
                        "name": the_child,
                        "size": child_size
                    })
        sunBurstData = d

        allYears = []
        allCAs = []
        allClass = []
        allTypes = []
        for cList, year, CA, patentType in zip(selectedClass, years, CAs,
                                               types):
            if (cList == cList and cList != None):
                for c in cList:
                    if (c != 'nan' and c != '' and c != 'NAN'):
                        allClass.append(c)
                        allYears.append(year)
                        allCAs.append(CA)
                        allTypes.append(patentType)
        expandedDF = pd.DataFrame()
        expandedDF['Categories'] = allClass
        expandedDF['Years'] = allYears
        expandedDF['Current Assignees'] = allCAs
        expandedDF['Types'] = allTypes

        # Number of PPA per year for line graph
        yearData = cim.getGroupByInput(df, 'Years', 'nPPA')

        # Number of PPA per year for area chart
        # areaData = cim.getGroupByInput2(df, 'Years', 'Types', 'nPPA')
        counts = df.groupby([
            'Years', 'Types'
        ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
        areaData = counts.to_json(orient="index")

        # Top 10 category counts
        categoryData = cim.getCrossTabInput(expandedDF, 'Categories', 'Types',
                                            targetN)
        categoryCounts = pd.crosstab(expandedDF['Categories'],
                                     [expandedDF['Types']],
                                     margins=True).sort_values(
                                         'All', ascending=False).reset_index()
        categoryCounts = categoryCounts.drop(['All'],
                                             axis=1).drop([0]).head(targetN)
        if (classification == 'cpc'):
            CPCLegend = zip(
                categoryCounts['Categories'].tolist(),
                pp.getCPCDescription(categoryCounts['Categories'].tolist()))

        categoryPercentages = expandedDF.groupby(
            ['Categories']).size().reset_index(name='percent')
        total = categoryPercentages['percent'].sum()
        categoryPercentages[
            'percent'] = categoryPercentages['percent'] / total * 100
        categoryPercentages['percent'] = categoryPercentages['percent'].round(
            decimals=1)
        categoryPercentages = categoryPercentages.head(targetN)
        categoryPercentData = categoryPercentages.to_json(orient="index")

        # Top 10 categories and their PPA counts per year
        grouped = expandedDF.groupby(
            ['Categories']).size().reset_index(name='Number of P/PA')
        topNCategories = grouped.nlargest(
            targetN, 'Number of P/PA')['Categories'].tolist()
        expandedDF = expandedDF[expandedDF['Categories'].isin(topNCategories)]
        expandedDF = expandedDF.sort_values(['Years', 'Categories'],
                                            ascending=False)
        hm = expandedDF.groupby(['Years',
                                 'Categories']).size().reset_index(name='nPPA')
        hm = hm.sort_values(['Years'], ascending=True)
        hm = hm.sort_values(['nPPA'], ascending=False)
        hm = hm[hm.Years != 0]
        maxCount = hm['nPPA'].max()
        minCount = hm['nPPA'].min()
        categoryYearData = hm.to_json(orient="index")

        # Top assignees
        assigneeData = cim.getCrossTabInput(df, 'Current Assignees', 'Types',
                                            targetN)
        # grouped = df.groupby(['Current Assignees']).size().reset_index(name='Number of P/PA')
        # topNAssignees = grouped.nlargest(targetN, 'Number of P/PA')['Current Assignees'].tolist()

        # Top 10 assignees and their PPA counts per year
        dfCopy = df.copy()
        dfCopy = dfCopy[dfCopy['Current Assignees'].isin(topNAssignees)]
        hm = dfCopy.groupby(['Years', 'Current Assignees'
                             ]).size().reset_index(name='nPPA')
        hm = hm[hm.Years != 0]
        maxCount2 = hm['nPPA'].max()
        minCount2 = hm['nPPA'].min()
        hm = hm.rename(index=str, columns={'Current Assignees': "Categories"})
        tempDF = pd.DataFrame(columns=['Years', 'Categories', 'nPPA'])
        tempDF['Categories'] = topNAssignees
        tempDF['Years'] = 9999
        tempDF['nPPA'] = 0
        tempDF = tempDF.append(hm).reset_index()
        assigneeYearData = tempDF.to_json(orient="index")

        # Top 10 categories and top 10 assignees
        expandedDF = expandedDF[expandedDF['Current Assignees'].isin(
            topNAssignees)]
        hm = expandedDF.groupby(['Categories', 'Current Assignees'
                                 ]).size().reset_index(name='nPPA')
        # hm = hm.sort_values(['CurrentAssignees'], ascending=True)
        hm = hm.sort_values(['nPPA'], ascending=False)
        hm = hm.rename(index=str,
                       columns={'Current Assignees': "CurrentAssignees"})
        tempDF = pd.DataFrame(
            columns=['Categories', 'CurrentAssignees', 'nPPA'])
        tempDF['CurrentAssignees'] = topNAssignees
        tempDF['Categories'] = "9999"
        tempDF['nPPA'] = 0
        tempDF = tempDF.append(hm).reset_index()
        categoryAssigneeData = tempDF.to_json(orient="index")
        maxCount3 = hm['nPPA'].max()
        minCount3 = hm['nPPA'].min()

    templateHTML = 'visualization/dataset_statistics.html'
    mainHTML = render_to_string(
        templateHTML, {
            'data_set': data_set,
            'classification': classification,
            'selectedClassificationDisplay': selectedClassificationDisplay,
            'dataSetNames': dataSetNames,
            'classificationNames': classificationNames,
            'minCount': minCount,
            'maxCount': maxCount,
            'minCount2': minCount2,
            'maxCount2': maxCount2,
            'minCount3': minCount3,
            'maxCount3': maxCount3,
            'nPPA': nPPA,
            'CPCLegend': CPCLegend,
            'targetN': targetN,
            'previousNYears': previousNYears,
            'yearData': yearData,
            'areaData': areaData,
            'categoryData': categoryData,
            'categoryPercentData': categoryPercentData,
            'categoryYearData': categoryYearData,
            'assigneeData': assigneeData,
            'assigneeYearData': assigneeYearData,
            'categoryAssigneeData': categoryAssigneeData,
            'sunBurstData': sunBurstData,
            'smallMultipleData': smallMultipleData
        })
    return mainHTML

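# Small illustration (toy data) of the crosstab pattern used for the "top N"
# bar charts in dataset_statistics: counts per category broken down by type,
# sorted on the 'All' margin, then the margin row and column are dropped.
#
# import pandas as pd
#
# toy = pd.DataFrame({'Categories': ['X', 'X', 'Y', 'Z'],
#                     'Types': ['patent', 'application', 'patent', 'patent']})
# ct = (pd.crosstab(toy['Categories'], toy['Types'], margins=True)
#         .sort_values('All', ascending=False)
#         .reset_index())
# ct = ct.drop(['All'], axis=1).drop([0])  # drop margin column and 'All' row
# print(ct.head(2))                        # X first (2 records), then Y or Z
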
def listlanguages():
    reponame = bleach.clean(request.args['a']).strip()
    # TODO: Handle empty reponame
    Languages = DBQueries.LanguageBreakdown(reponame)
    return jsonify(languages=Languages)

def general_trends(request, date):
    dates = []
    previousDate = None
    yearData = None
    targetN = 10
    if (request.method == "POST"):
        targetN = int(request.POST.get('target-n'))

    df = pd.DataFrame()
    df = dbq.getDataSetPatentsBySource(df, 'uspto')
    assigneeSectorsIndustries = dbq.getAssigneeSectorIndustry()

    for d in list(set(df['Dates'].tolist())):
        dates.append(str(d))
    dates.sort(reverse=True)
    date = dates[0]
    previousDate = dates[1]

    # Number of PPA per week for line graph
    yearData = cim.getGroupByInput(df, 'Dates', 'nPPA')

    # Assignee data preparation
    dfCopy = df.copy()
    dfCopy = dfCopy[dfCopy['Dates'].isin([date, previousDate])]
    dfCopy = pp.assignAssigneeSectorIndustry(dfCopy, assigneeSectorsIndustries,
                                             'uspto')
    dfCopy = dfCopy.dropna(subset=['Sectors'])
    dfCopy['Current Assignees'] = dfCopy['Clean Assignees']
    dfCopy = dfCopy[['Dates', 'Current Assignees', 'ids']]
    assignees = list(set(dfCopy['Current Assignees'].tolist()))
    counts = dfCopy.groupby([
        'Dates', 'Current Assignees'
    ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
    countBefore = counts['nPPA'].tolist()[:len(assignees)]
    countAfter = counts['nPPA'].tolist()[len(assignees):]
    differences = counts.groupby(['Current Assignees'
                                  ]).diff()[len(assignees):]
    totals = counts.groupby(['Current Assignees']).sum()
    assigneeData = pd.DataFrame()
    assigneeData['Current Assignees'] = assignees
    assigneeData = assigneeData.sort_values(by='Current Assignees')
    assigneeData['Before'] = countBefore
    assigneeData['After'] = countAfter
    assigneeData['Total'] = totals['nPPA'].tolist()
    assigneeData['Change'] = differences['nPPA'].tolist()
    assigneeData[
        'PercentChange'] = assigneeData['Change'] / assigneeData['Total'] * 100
    assigneeTopLosersData = assigneeData.sort_values(by='Change').head(targetN)
    assigneeData = assigneeData.sort_values(by='Change',
                                            ascending=False).head(targetN)
    assigneeLosersChanges = zip(assigneeTopLosersData['Current Assignees'],
                                assigneeTopLosersData['Change'],
                                assigneeTopLosersData['PercentChange'])
    assigneeChanges = zip(assigneeData['Current Assignees'],
                          assigneeData['Change'],
                          assigneeData['PercentChange'])

    # Sector data preparation
    dfCopy = df.copy()
    dfCopy = dfCopy[dfCopy['Dates'].isin([date, previousDate])]
    dfCopy = pp.assignAssigneeSectorIndustry(dfCopy, assigneeSectorsIndustries,
                                             'uspto')
    dfCopy = dfCopy.dropna(subset=['Sectors'])
    dfCopy = dfCopy[['Dates', 'Sectors', 'ids']]
    sectors = list(set(dfCopy['Sectors'].tolist()))
    counts = dfCopy.groupby([
        'Dates', 'Sectors'
    ]).size().unstack(fill_value=0).stack().reset_index(name='nPPA')
    countBefore = counts['nPPA'].tolist()[:len(sectors)]
    countAfter = counts['nPPA'].tolist()[len(sectors):]
    differences = counts.groupby(['Sectors']).diff()[len(sectors):]
    totals = counts.groupby(['Sectors']).sum()
    sectorData = pd.DataFrame()
    sectorData['Sectors'] = sectors
    sectorData = sectorData.sort_values(by='Sectors')
    sectorData['Before'] = countBefore
    sectorData['After'] = countAfter
    sectorData['Total'] = totals['nPPA'].tolist()
    sectorData['Change'] = differences['nPPA'].tolist()
    sectorData[
        'PercentChange'] = sectorData['Change'] / sectorData['Total'] * 100
    sectorData = sectorData.sort_values(by='Change', ascending=False)
    sectorChanges = zip(sectorData['Sectors'], sectorData['Change'],
                        sectorData['PercentChange'])

    templateHTML = 'uspto/general_trends.html'
    mainHTML = render_to_string(
        templateHTML, {
            'date': date,
            'dates': dates,
            'yearData': yearData,
            'sectorData': sectorData,
            'sectorChanges': sectorChanges,
            'assigneeChanges': assigneeChanges,
            'assigneeLosersChanges': assigneeLosersChanges,
            'targetN': targetN,
        })
    return mainHTML

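# Toy sketch of the before/after delta logic in general_trends: counts for
# the two most recent dates are zero-filled, then groupby(...).diff() yields
# the week-over-week change for each group. Data is invented for illustration.
#
# import pandas as pd
#
# toy = pd.DataFrame({'Dates': ['w1', 'w1', 'w2'],
#                     'Sectors': ['Tech', 'Auto', 'Tech']})
# counts = (toy.groupby(['Dates', 'Sectors']).size()
#              .unstack(fill_value=0).stack().reset_index(name='nPPA'))
# diffs = counts.groupby('Sectors')['nPPA'].diff()
# print(counts.assign(Change=diffs))  # NaN for w1 rows, the delta for w2 rows
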
def new_data_setup(request):
    errors = []
    warnings = []
    form = None
    valid = False
    hasInputFile = False
    columnList = None
    dataSetName = None
    targetYearColumnName = None
    targetAssigneeColumnName = None
    sampleFileHTML = None
    inputFileHTML = None
    inFileName = 'newDataInput'
    inFolderName = '../in/'
    fileType = '.xlsx'

    # Submit File View Setup
    if (request.method == "POST" and request.POST.get('upload')):
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            inputFile = request.FILES['file']
            targetSheetName = request.POST.get('target-sheet')
            inputFileDF = pd.read_excel(inputFile, targetSheetName)
            columnList = list(inputFileDF.columns)
            columnList.append('Not Applicable')
            inputFileDF.to_excel(inFolderName + inFileName + fileType,
                                 index=False)
            inputFileHTML = inputFileDF.head(50).to_html()
            try:
                if (inputFile):
                    inputFile.close()
            except Exception:
                valid = False
                errors.append(
                    "The process cannot access the input file because it is "
                    "being used by another process.")
        hasInputFile = True

    # Step 2 View
    elif (request.method == "POST" and request.POST.get('finish')):
        dataSetName = request.POST.get('data-set-name')
        sourceName = request.POST.get('target-source')

        # Map each POST field to the canonical column name it supplies.
        # A selection of 'Not Applicable' yields an empty column; otherwise
        # the user-selected column is renamed to the canonical name.
        columnMappings = [
            ('target-column-publication-number', 'PUBLICATION NUMBER'),
            ('target-column-assignee', 'CA'),
            ('target-column-year', 'YEAR'),
            ('target-main-cpc', 'MAIN CPC'),
            ('target-column-cpc', 'CPC'),
            ('target-column-category', 'CATEGORY'),
            ('target-column-titles', 'TITLES'),
            ('target-column-abstracts', 'ABSTRACTS'),
            ('target-column-independent-claims', 'INDEPENDENT CLAIMS'),
            ('target-column-technical-concepts', 'TECHNICAL CONCEPTS'),
        ]

        df = pd.read_excel(inFolderName + inFileName + fileType, 'Sheet1')
        for postField, columnName in columnMappings:
            selectedColumn = request.POST.get(postField)
            if (selectedColumn == 'Not Applicable'):
                df[columnName] = ''
            else:
                df = df.rename(index=str,
                               columns={selectedColumn: columnName})

        # df['MAIN CPC DESCRIPTION'] = pp.getCPCDescriptions(df)
        # df['CPC DESCRIPTIONS'] = pp.getCPCListDescriptions(df)
        # temp = cc.removePublicationNumbers(df['TECHNICAL CONCEPTS'].tolist())
        # temp = cc.removeConceptPostfix(temp)
        # temp = cc.getTechnicalConcepts(temp)
        # df['TECHNICAL CONCEPTS'] = temp
        df['TYPE'] = cc.getDocumentTypes(df['PUBLICATION NUMBER'], 9)
        keywordsDF = dbq.getAllAssigneeKeywords()
        df = pp.assigneeGrouping(df, keywordsDF)
        # dbq.updateCleanCurrentAssignees(dataSetName, df)
        dbq.insertPatents(df, dataSetName, sourceName)
        hasInputFile = True
        valid = True

        dataSetNames = []
        datasets = Datasets.objects.all()
        for dataset in datasets:
            dataSetNames.append(dataset.name)
        dataSetNames.sort()
        templateHTML = 'data_sets/index.html'
        mainHTML = render_to_string(templateHTML, {
            'dataSetNames': dataSetNames,
        })
        return mainHTML

    # Default View
    else:
        form = UploadFileForm()
        sampleFileDF = pd.read_excel('../out/Small Sample File.xlsx', 'Sheet1')
        sampleFileHTML = sampleFileDF.head().to_html()

    templateHTML = 'data_sets/new_data_setup.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'hasInputFile': hasInputFile,
            'errors': errors,
            'warnings': warnings,
            'columnList': columnList,
            'sampleFileHTML': sampleFileHTML,
            'inputFileHTML': inputFileHTML,
        })
    return mainHTML

def assignee_statistics(request, data_set, classification, assignee):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    hasAssignee = False
    nPPA = None
    CPCLegend = None
    assigneeList = None
    selectedClass = ''
    selectedClassificationDisplay = ''
    maxYear = None
    categoryData = None
    yearData = None
    previousNYears = 20
    targetAssigneeColumnName = "CA"
    targetYearColumnName = "YEAR"
    targetCategoryColumnName = "Categories"

    # Data set selection view
    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    classificationNames = dbq.getClassificationList()
    classificationNames.insert(0, 'index')

    # Category selection view
    if (not data_set == 'index' and assignee == 'index'):
        # df = dbq.getDataSetPatents(data_set)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentAssignees(data_set, df)
        df['Current Assignees'] = df['Clean Assignees'].tolist()
        CAs = df['Current Assignees']
        assigneeList = sorted(list(set(CAs.tolist())))
        assigneeList.insert(0, 'index')
        hasDataSet = True

    # Graph preparations
    elif (not data_set == 'index' and not assignee == 'index'):
        if (request.method == "POST"):
            previousNYears = int(request.POST.get('target-n-years'))
        # df = dbq.getDataSetPatents(data_set)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentYears(data_set, df)
        df = dbq.getDataSetPatentTypes(data_set, df)
        df = dbq.getDataSetPatentAssignees(data_set, df)
        assigneeList = sorted(list(set(df['Current Assignees'].tolist())))
        df['Current Assignees'] = df['Clean Assignees'].tolist()
        df = dbq.getDataSetPatentColumn(data_set, df, classification)
        df.drop(df[df['Current Assignees'] != assignee].index, inplace=True)

        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]
        years = df['Years']
        nPPA = len(df.index)
        selectedClass = df[classification].tolist()
        CAs = df['Current Assignees']
        types = df['Types']

        allClass = []
        allYears = []
        allCAs = []
        allTypes = []
        for year, CA, cList, patentType in zip(years, CAs, selectedClass,
                                               types):
            if (cList == cList and cList != None):
                # for c in ast.literal_eval(cList):
                for c in cList:
                    if (c and c == c and c != 'nan' and c != 'NAN'):
                        allClass.append(c.lstrip().rstrip())
                        allYears.append(int(year))
                        allCAs.append(str(CA).lower().lstrip().rstrip())
                        allTypes.append(patentType)
        expandedDF = pd.DataFrame()
        expandedDF[targetCategoryColumnName] = allClass
        expandedDF[targetYearColumnName] = allYears
        expandedDF[targetAssigneeColumnName] = allCAs
        expandedDF['Types'] = allTypes

        # Line Graph Data
        expandedDFCopy = expandedDF.copy()
        grouped = df.groupby(['Years'])
        groupSizes = df.groupby(['Years']).size()
        years = []
        sizes = []
        for g, s in zip(grouped, groupSizes):
            years.append(int(g[0]))
            sizes.append(s)
        sizesDF = pd.DataFrame()
        sizesDF['Year'] = years
        sizesDF['Count'] = sizes
        maxYear = int(sizesDF.iloc[sizesDF['Count'].argmax()]['Year'])

        # Bar Graph 1 Data
        uniqueCategories = []
        expandedDFCopy = expandedDF.copy()
        assigneeCounts = pd.crosstab(
            expandedDFCopy[targetCategoryColumnName],
            [expandedDFCopy['Types']],
            margins=True).sort_values('All', ascending=False).reset_index()
        assigneeCounts = assigneeCounts.drop(['All'],
                                             axis=1).drop([0]).head(10)
        uniqueCategories = assigneeCounts[targetCategoryColumnName].tolist()
        categoryData = assigneeCounts.to_json(orient="index")

        # Bar Graph 2 Data
        # Year Bar Graph Data
        yearData = pd.crosstab(df['Years'], [df['Types']]).reset_index()
        yearData = yearData.rename(index=str, columns={'Years': "Categories"})
        yearData = yearData.to_json(orient="index")

        if (classification == 'cpc'):
            CPCLegend = zip(uniqueCategories,
                            pp.getCPCDescription(uniqueCategories))
        hasDataSet = True
        hasAssignee = True
        valid = True

    templateHTML = 'visualization/assignee_statistics.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'hasDataSet': hasDataSet,
            'hasAssignee': hasAssignee,
            'classification': classification,
            'selectedClassificationDisplay': selectedClassificationDisplay,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'nPPA': nPPA,
            'CPCLegend': CPCLegend,
            'dataSetNames': dataSetNames,
            'classificationNames': classificationNames,
            'assigneeList': assigneeList,
            'data_set': data_set,
            'assignee': assignee,
            'maxYear': maxYear,
            'categoryData': categoryData,
            'yearData': yearData,
            'previousNYears': previousNYears,
        })
    return mainHTML

# -*- coding: utf-8 -*-
import DBQueries

##########################
# Collect dataframe DBs  #
##########################
dbs = DBQueries.dictDB()
dbs = dbs.sort_values(['DB', 'Date'], ascending=[True, False])
dbs = dbs.groupby('DB').first().reset_index()

with open('data/current.ini', 'w') as the_file:
    the_file.write('[DBs]\n')
    the_file.write('number={}\n'.format(len(dbs)))
    the_file.write('[first_edition]\n')
    the_file.write('host=database\n')
    the_file.write('database=enzo\n')
    the_file.write('user=postgres\n')
    the_file.write('port=5432\n')
    for index, row in dbs.iterrows():
        the_file.write('[{}]\n'.format(row['DB']))
        the_file.write('host=10.9.0.50\n')
        the_file.write('database={}\n'.format(row['Hash']))
        the_file.write('user=postgres\n')
        the_file.write('port=5432\n')
        the_file.write('password=xxxxxx\n')
    the_file.write('[ElasticSearch]\n')
    # the_file.write('server=elk.enzo.net\n')
    the_file.write('server=10.9.0.200\n')
    the_file.write('index_name=first_edition*\n')

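# Hedged companion sketch: reading the generated INI back with the standard
# library's configparser, assuming the file layout written above.
#
# import configparser
#
# config = configparser.ConfigParser()
# config.read('data/current.ini')
# print(config['DBs']['number'])           # number of collected DBs
# for section in config.sections():
#     if config.has_option(section, 'database'):
#         print(section, '->', config[section]['database'])
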
def word_cluster_map(request, data_set, column):
    errors = []
    warnings = []
    form = None
    valid = False
    hasDataSet = False
    clusterData = None
    dataSetNames = []
    selectedClassificationDisplay = ''
    minNodeSize = 99999
    maxNodeSize = 0
    minEdgeWeight = 99999
    maxEdgeWeight = 0
    minNodeSizeWithLabel = 20
    maxNNodes = 30
    previousNYears = 20
    topN = 10

    dataSetNames = []
    datasets = Datasets.objects.all()
    for dataset in datasets:
        dataSetNames.append(dataset.name)
    dataSetNames.insert(0, 'index')
    columnNames = ['titles', 'abstracts', 'independent_claims']
    columnNames.insert(0, 'index')

    # Model setup view
    if (not (data_set == 'index' or column == 'index')):
        if (request.method == "POST"):
            maxNNodes = int(request.POST.get('target-n-nodes'))
            previousNYears = int(request.POST.get('target-n-years'))
        # if(len(df.index) > 1000):
        #     df = df.sample(n=500, replace=False, random_state=17)
        df = pd.DataFrame()
        df = dbq.getDataSetPatentTACs(data_set, df)
        df = dbq.getDataSetPatentYears(data_set, df)
        years = df['Years']
        maxYear = max(years)
        minYear = maxYear - previousNYears + 1
        df = df[df.Years >= minYear]

        wordList = []
        with open('../out/words.txt', 'r') as f:
            for line in f:
                wordList.append(line.rstrip())

        columnWords = []
        if (column == 'titles'):
            columnWords = pp.normalizeCorpus(df['Titles'].tolist(), wordList)
        elif (column == 'abstracts'):
            columnWords = pp.normalizeCorpus(df['Abstracts'].tolist(),
                                             wordList)
        elif (column == 'independent_claims'):
            columnWords = pp.normalizeCorpus(df['Independent Claims'].tolist(),
                                             wordList)
        selectedColumn = columnWords

        uniqueWords = []
        combinedWords = []
        allWords = []
        for wordList in selectedColumn:
            if (wordList == wordList and wordList != None):
                for word in wordList:
                    if (word != 'nan' and word != '' and word != 'NAN'
                            and word != 'Nan'):
                        allWords.append(word)
        expandedDF = pd.DataFrame()
        expandedDF[column] = allWords
        uniqueWords = list(set(allWords))
        wordSizes = expandedDF.groupby([column]).size().reset_index(name='nPPA')
        topNWords = wordSizes.nlargest(topN, 'nPPA')[column].tolist()
        wordList = wordSizes[column].tolist()
        wordSizesList = wordSizes['nPPA'].tolist()

        # Cleaning of CPC
        relationships = selectedColumn
        relationshipsEval = []
        if (maxNNodes > 0):
            topNNodes = v.getTopNNodes(relationships, maxNNodes)
            for rList in relationships:
                tempRList = []
                for node in list(filter(lambda a: a != '', rList)):
                    if (node in topNNodes):
                        tempRList.append(node)
                relationshipsEval.append(tempRList)
        else:
            for rList in relationships:
                relationshipsEval.append(list(filter(lambda a: a != '',
                                                     rList)))

        source = []
        target = []
        weight = []
        for r in relationshipsEval:
            pairs = combinations(r, 2)
            for p in pairs:
                source.append(p[0])
                target.append(p[1])
                weight.append(1)
        newDF = pd.DataFrame()
        newDF['source'] = source
        newDF['target'] = target
        newDF['weight'] = weight
        graphDF = newDF.groupby(['source', 'target']).sum().reset_index()
        maxEdgeWeight = graphDF['weight'].max()
        minEdgeWeight = graphDF['weight'].min()
        # graphDF.to_excel(outFolderName + 'edgelist.xlsx')

        G = nx.from_pandas_edgelist(graphDF, 'source', 'target', 'weight')
        G2 = nx.convert_node_labels_to_integers(G, label_attribute='name')
        # Determine node groups using Louvain modularity
        communities = best_partition(G2, weight='size')
        d = nx.readwrite.json_graph.node_link_data(G2, {'name': 'index'})

        nodeNames = []
        nodeCommunities = []
        nodeSizes = []
        nodeTop10 = []
        for node in d['nodes']:
            name = node['name']
            # size = G2.degree[list(G.nodes()).index(node['name'])]
            size = wordSizesList[wordList.index(node['name'])]
            community = communities[list(G.nodes()).index(node['name'])]
            node['size'] = size
            node['group'] = community
            nodeNames.append(name)
            nodeSizes.append(size)
            nodeCommunities.append(community)
            if (node['size'] < minNodeSize):
                minNodeSize = node['size']
            if (node['size'] > maxNodeSize):
                maxNodeSize = node['size']
        # minNodeSizeWithLabel = 0.2 * maxNodeSize
        # for node in d['nodes']:
        #     if(node['size'] < minNodeSizeWithLabel):
        #         node['name'] = None
        for node in d['nodes']:
            if (not node['name'] in topNWords):
                node['fontSize'] = 8
                node['opacity'] = 0.5
            else:
                node['fontSize'] = node['size']
                node['opacity'] = 1

        nodesDF = pd.DataFrame()
        nodesDF['CPC'] = nodeNames
        nodesDF['Size'] = nodeSizes
        nodesDF['Community'] = nodeCommunities

        del d["directed"]
        del d["multigraph"]
        del d["graph"]
        clusterData = d
        hasDataSet = True
        valid = True

    templateHTML = 'visualization/word_cluster_map.html'
    mainHTML = render_to_string(
        templateHTML, {
            'form': form,
            'valid': valid,
            'errors': errors,
            'warnings': warnings,
            'data_set': data_set,
            'column': column,
            'columnNames': columnNames,
            'hasDataSet': hasDataSet,
            'dataSetNames': dataSetNames,
            'minNodeSize': minNodeSize,
            'maxNodeSize': maxNodeSize,
            'maxEdgeWeight': maxEdgeWeight,
            'minEdgeWeight': minEdgeWeight,
            'clusterData': clusterData,
            'maxNNodes': maxNNodes,
            'previousNYears': previousNYears,
        })
    return mainHTML

import re
import HTMLGenerator
import logging
import DBQueries as DBQ
import QueryExecution as Q

queryObj = DBQ.DBQuery_Extractor()
QueryDict = queryObj.fetch_Query('Student_monthly_report.sql')
# dict views have no .sort() in Python 3, so sort into a list instead
identifierList = sorted(QueryDict.keys())

TaskQueries = []
for i in identifierList:
    if re.search("TASK", i):
        TaskQueries.append(i)
    else:
        Q.DBQueryExecution(QueryDict[i])

monthdict = {
    'January': "'%-01-%'",
    'February': "'%-02-%'",
    'March': "'%-03-%'",
    'April': "'%-04-%'",
    'May': "'%-05-%'",
    'June': "'%-06-%'",
    'July': "'%-07-%'",
    'August': "'%-08-%'",
    'September': "'%-09-%'",
    'October': "'%-10-%'",
    'November': "'%-11-%'",
    'December': "'%-12-%'",
}

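# Hedged usage sketch for monthdict: the values are pre-quoted SQL LIKE
# patterns, presumably spliced into the deferred TASK queries. The query
# template below is hypothetical, purely to show how the patterns slot in.
#
# template = "SELECT * FROM tasks WHERE due_date LIKE {month};"
# print(template.format(month=monthdict['March']))
# # -> SELECT * FROM tasks WHERE due_date LIKE '%-03-%';
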