Ejemplo n.º 1
0
def findParents(TermID, parentHits, secondaryHits, checked):
	if TermID not in checked:
		checked.append(TermID)
		if str(TermID) in primary_parents_list:
			if str(TermID) not in parentHits:
				parentHits.append(TermID)
		else:	
			query = "SELECT DISTINCT `ParentTermID`, `ParentTermLabel` FROM `thesaurusNew2015` WHERE TermID = %s"
			cursor = db.get_cursor()	
			cursor.execute(query, [TermID])
			set  = {}
			while True: 
				row = cursor.fetchone()
				if not row:
					break
				if row:
					set[str(row["ParentTermID"])] = row["ParentTermLabel"]
                        db.get_connection().commit()
			cursor.close()

			for ParentTermID in set:
				if str(ParentTermID) in primary_parents_list:
					if str(ParentTermID) not in parentHits:
						parentHits.append(ParentTermID)
					if str(TermID) not in secondaryHits:
						secondaryHits.append(TermID)
				else:
					parentHits, secondaryHits = findParents(ParentTermID, parentHits, secondaryHits, checked)
	return parentHits, secondaryHits
Ejemplo n.º 2
0
def getBaseProbabilities(level, database):
	base_probabilities = {}
	if database:
		level = level[0].upper() + level[1:]	
	tablename = database + level + 'LevelBaseProbabilities2015'
	query = 'SELECT *  FROM {tablename}'.format(tablename = tablename)
	cursor = db.get_cursor()
	cursor.execute(query)
	count = 0
	while True: 
		row = cursor.fetchone()
		if not row: 
			break
		else:
			if level.lower() == 'first':
                		if count <= 100: 
                        		for row_val in row:
                                		if row_val not in base_probabilities:
                                        		base_probabilities[row_val] = {}
                                		base_probabilities[row_val][count+1] = {}
                                		base_probabilities[row_val][count+1] = row[row_val]
				
			if level.lower() == 'second':
                		parent = "L" + str(row["ParentID"])
                		if parent not in base_probabilities:
                        		base_probabilities[parent] = {}
                		for index in range(1,101):
                        		base_probabilities[parent][index] = {}
                        		base_probabilities[parent][index] = row["L" + str(index)]					
		count += 1 
	newtablename = '{tablename}_extended'.format(tablename = tablename)
	return base_probabilities, newtablename
Ejemplo n.º 3
0
def insertToNewTable(baseProbabilities, parents, tablename):
	columnList = ['ParentID', 'ParentLabel']
        for num in range(1,101):
                columnList.append('L' + str(num))
        columnsString = ', '.join(columnList)
	
	queryTemplate = 'INSERT INTO {tablename} ({columns}) VALUES ({values})'
	cursor = db.get_cursor()
	for parent in baseProbabilities:
		if parent[1:] in parents:
			baseProbabilities[parent]["ParentLabel"] = parents[parent[1:]]
		
			values = baseProbabilities[parent]
			string = values["ParentLabel"].encode('ascii', 'ignore')
			string = re.sub("'", "", string)
			label  = "'{label}'".format(label = string)
			valueList = []
			valueList.append(parent[1:])
			valueList.append(label)
			for num in range(1,101):
				valueList.append(str(values[num]))
			valuesString  = ', '.join(valueList)
			query = queryTemplate.format(tablename = tablename, columns = columnsString, values = valuesString)
			print(query)
			cursor.execute(query)
	cursor.close()
Ejemplo n.º 4
0
def createNewTable(level, database, tablename):
	query = 'CREATE TABLE IF NOT EXISTS {tablename}'.format(tablename = tablename) 	
	columnList = ['ParentID INT', 'ParentLabel VARCHAR(200)']
	for num in range(1,101):
		columnList.append('L' + str(num) + ' FLOAT')
	columnsString = ', '.join(columnList)
	query = query + ' ( ' + columnsString + ' ) '
	cursor = db.get_cursor()
	cursor.execute(query)
	cursor.close()	
Ejemplo n.º 5
0
def getParents(level):
	parents = {}
	tablename = level + 'LevelParents2015'
	query = 'SELECT * from {tablename}'.format(tablename = tablename)
	cursor = db.get_cursor()
	cursor.execute(query)
	while True: 
		row = cursor.fetchone()
		if not row:
			break
		else:
			parents[str(row["TermID"])] = row["TermLabel"] 
	return parents	
from __future__ import division
import common.db as db
import json
import csv
from math import floor, ceil

# This is for only ShortFormID = 2, Even Testimonies and Pruned with SaturationScore >= 0.20
# And now, further pruned with TotSegment >= 40.
query1 = """
SELECT * 
FROM testimoniesSaturationPruned2015 
"""

cursor1 = db.get_cursor()
cursor1.execute(query1)
testimony_details_pruned = {}
i = 0
while True:
    row = cursor1.fetchone()
    if not row:
        break
    else:
        testimony_details_pruned[row["IntCode"]] = {
            "IntCode": row["IntCode"],
            "IntervieweeName": row["IntervieweeName"],
            "TotTaggedSegments": row["TotTaggedSegments"],
            "TotSegments": row["TotSegments"],
            "SaturationPercent": row["SaturationPercent"],
            "NormativityScore": 0
        }
    i = i + 1
Ejemplo n.º 7
0
from __future__ import division
import common.db as db
import json
import xmltodict
from math import floor, ceil

# Extracting the First Level Parents: 
# Important Note!: primary node has label 32582 NOT -1 in thesaurusNew2015

query = """
SELECT DISTINCT  `TermID` ,  `TermLabel` 
FROM  `thesaurusNew2015` 
WHERE  `ParentTermID` = 32582
"""

cursor = db.get_cursor()
cursor.execute(query)
primary_parents = {}
primary_parents_list = []
while True:
        row = cursor.fetchone()
        if not row:
                break
        if row:
                primary_parents[row["TermID"]] = row["TermLabel"]
                primary_parents_list.append(str(row["TermID"]))
cursor.close()
print(len(primary_parents))

          
         
Ejemplo n.º 8
0
import common.db as db

cursor = db.get_cursor()
print "Resetting table ..."
cursor.execute("""DROP TABLE IF EXISTS firstLevelBaseProbabilities""")
segmentile_columns = ['L{}'.format(x) for x in range(1, 101)]
cursor.execute(
    """CREATE TABLE firstLevelBaseProbabilities (TermID INT, KeywordLabel VARCHAR(100), ShortFormID INT, {} FLOAT)"""
    .format(' FLOAT, '.join(segmentile_columns)))
print "Table reset"

print "Loading data ..."
query = """
SELECT 
    totalKeywordsBySegment.ShortFormID, 
    totalKeywordsBySegment.segmentile, 
    FirstLevelParentID, 
    keywordCountsBySegment.keywordCount, 
    keywordCountsBySegment.keywordCount / totalKeywordsBySegment.totalKeywords AS probability

FROM (
    -- Total keywords by segment and ShortFormID
    SELECT t.ShortFormID, ROUND(SegmentNumber / totalSegments.total * 100) AS segmentile, COUNT(*) AS totalKeywords
    FROM segmentKeywordsFirstParents2015Filled AS sk
    JOIN (
    	SELECT sk.IntCode, MAX(SegmentNumber) AS total
        FROM segmentKeywordsFirstParents2015Filled AS sk
	    GROUP BY sk.IntCode
	) AS totalSegments ON totalSegments.IntCode = sk.IntCode
    JOIN testimonies2015_include AS t ON t.IntCode = sk.IntCode
    GROUP BY t.ShortFormID, segmentile
import common.db as db
import csv
import json
from __future__ import division
from math import floor, ceil

# Getting all the unique parents first. And then creating the parents table 
# First, processing only the first level parents.
query1 = """
SELECT TermID
FROM  `secondLevelParents2015` 
"""
cursor1 = db.get_cursor()
cursor1.execute(query1)
parentsTable = {}
while True:
	row=cursor1.fetchone()
	if not row:
		break
	else:
		print row["TermID"]
		i = 1
		parentsTable[row["TermID"]] = {}
		for i  in range(1,101):
			parentsTable[row["TermID"]][i] = 0
cursor1.close()

# All the parent values have now been initialized.
query1 = """
SELECT * 
FROM testimoniesSaturationPruned2015 
import common.db as db

conn = db.get_connection()
cursor = db.get_cursor()

cursor.execute('''DROP TABLE IF EXISTS firstLevelBaseProbabilitiesUnrolled''')
cursor.execute(
    '''CREATE TABLE firstLevelBaseProbabilitiesUnrolled (TermID INT, ShortFormID INT, Percentile INT, Probability FLOAT)'''
)
cursor.execute('''DROP TABLE IF EXISTS secondLevelBaseProbabilitiesUnrolled''')
cursor.execute(
    '''CREATE TABLE secondLevelBaseProbabilitiesUnrolled (TermID INT, ShortFormID INT, Percentile INT, Probability FLOAT)'''
)

read_cursor = db.get_cursor()
write_cursor = db.get_cursor()

read_cursor.execute('''SELECT * FROM firstLevelBaseProbabilities ''')
while True:
    row = read_cursor.fetchone()
    if not row:
        break
    term_ID = row['TermID']
    short_form_ID = row['ShortFormID']
    for segmentile in range(1, 101):
        key = 'L' + str(segmentile)
        write_cursor.execute(
            '''INSERT INTO firstLevelBaseProbabilitiesUnrolled (TermID, ShortFormID, Percentile, Probability) VALUES (%s, %s, %s, %s) ''',
            [
                term_ID,
                short_form_ID,
# This is to calculate the FIRST LEVEL parents based uniqueness scores for all
# the of testimonies fed from the chosen table and write them to file. 

import common.db as db
import csv
import json
from __future__ import division
from math import floor, ceil

# Read first level parent base probabilities.
query1 = """
SELECT * 
FROM firstLevelBaseProbabilities2015
"""
cursor1 = db.get_cursor()
cursor1.execute(query1)
base_probabilities = {} 
while True:
        row = cursor1.fetchone()
        if not row:
                break
        else:
                parent = "L" + str(row["ParentID"])
                if parent not in base_probabilities:
                        base_probabilities[parent] = {}
                for index in range(1,101):
                        base_probabilities[parent][index] = {}
                        base_probabilities[parent][index] = row["L" + str(index)]
cursor1.close()

from __future__ import division
import common.db as db
import json
import csv
from math import floor, ceil

# This is for only ShortFormID = 2, Even Testimonies and Pruned with SaturationScore >= 0.20
# And now, further pruned with TotSegment >= 40. 
query1 = """
SELECT * 
FROM testimoniesSaturationPruned2015 
"""

cursor1 = db.get_cursor()
cursor1.execute(query1)
testimony_details_pruned = {}
i=0
while True:
        row=cursor1.fetchone()
        if not row:
                break
        else:
                testimony_details_pruned[row["IntCode"]]={
                        "IntCode" : row["IntCode"],
                        "IntervieweeName" : row["IntervieweeName"],
                        "TotTaggedSegments" : row["TotTaggedSegments"],
                        "TotSegments" : row["TotSegments"],
                        "SaturationPercent" : row["SaturationPercent"],
                        "NormativityScore" : 0
                }
        i=i+1
Ejemplo n.º 13
0
# Opening all files in order:
from os import walk
allfiles = []
mypath = args.thesaurus
for (dirpath, dirnames, filenames) in walk(mypath):
    allfiles.extend(filenames)
    break

insert_query3 = """
INSERT INTO thesaurus2015
(TermID, TermLabel, ParentTermID, ParentTermLabel, ThesaurusID, ThesaurusType, KeywordCount, TypeID, IsParent, KWDefinition)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

count = 1
cursor = db.get_cursor()

for file in allfiles:
    if True:
        try:
            filename = args.thesaurus + file
            myfile = open(filename, 'r')
            data = myfile.read()
            udata = unicode(data, 'utf-8')
            encoded_udata = udata.encode('utf-8-sig')
            decoded_udata = encoded_udata.decode('utf-8-sig')
            ordered_dict = xmltodict.parse(decoded_udata)
            pydict = dict(ordered_dict)
            items = pydict["USCShoahFoundationInstituteData"]["ThesaurusData"][
                "Items"]
            l = len(items["Item"])
import common.db as db

conn = db.get_connection()
cursor = db.get_cursor()

print 'Clearing database ...'
cursor.execute('''DROP TABLE IF EXISTS uniqueness_scores_first_level ''')
cursor.execute('''DROP TABLE IF EXISTS uniqueness_scores_second_level ''')
cursor.execute(
    '''CREATE TABLE uniqueness_scores_first_level  (IntCode INT, Score FLOAT)'''
)
cursor.execute(
    '''CREATE TABLE uniqueness_scores_second_level  (IntCode INT, Score FLOAT)'''
)

max_segments = {}
print 'Finding maximum segments ...'
cursor.execute(
    '''SELECT skf.IntCode, MAX(skf.SegmentNumber) AS max_seg FROM segmentKeywordsFirstParents2015Filled skf 
        JOIN testimonies2015_include AS ti ON ti.IntCode = skf.IntCode
        GROUP BY IntCode
''')
while True:
    row = cursor.fetchone()
    if not row:
        break
    max_segments[row['IntCode']] = row['max_seg']

print 'Running first level normativities ...'
cursor.execute('''SELECT * FROM firstLevelBaseProbabilities ''')
first_level_base_probabilities = {}
from os import walk
allfiles = []
mypath = args.thesaurus
for (dirpath, dirnames, filenames) in walk(mypath):
    allfiles.extend(filenames)
    break


insert_query3 = """
INSERT INTO thesaurus2015
(TermID, TermLabel, ParentTermID, ParentTermLabel, ThesaurusID, ThesaurusType, KeywordCount, TypeID, IsParent, KWDefinition)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

count = 1
cursor = db.get_cursor()

for file in allfiles: 
	if True: 
		try:
			filename = args.thesaurus + file
			myfile = open(filename, 'r')
			data = myfile.read()
			udata = unicode(data, 'utf-8')
			encoded_udata = udata.encode('utf-8-sig')
			decoded_udata = encoded_udata.decode('utf-8-sig')
			ordered_dict = xmltodict.parse(decoded_udata)
			pydict = dict(ordered_dict)	
			items = pydict["USCShoahFoundationInstituteData"]["ThesaurusData"]["Items"]
			l = len(items["Item"])
			for no in range(0,l):
# This code:
# First, the all of the testimonies and their details are collectedi from the EVEN segments testimonies table.
# Second, the saturation scores and the unique_tags are gotten from the segnment-keywords table.
# Third,
from __future__ import division
import common.db as db
import json

query = """
SELECT * from evenSegmentTestimonies2015
"""
cursor = db.get_cursor()
cursor.execute(query)
evenTestimoniesList = []
while True:
    row = cursor.fetchone()
    if not row:
        break
    if row:
        evenTestimoniesList.append(row["IntCode"])
cursor.close()
refinedEvenTestimoniesList = ', '.join(a for a in evenTestimoniesList)

query = """
SELECT ES.IntCode AS IntCode, MAX( ES.SegmentNumber ) AS TotSegments, CS.ShortFormID AS ShortFormID
FROM segmentKeywordsTable2015 AS ES, testimonies2015 AS CS
WHERE ES.IntCode = CS.IntCode
AND CS.ShortFormID = 2
AND ES.IntCode in ({0})
GROUP BY ES.IntCode
"""
# This code is to generate a new thesaurus (TWO PASS Code): 
# 	- Match the thesaurus keywords and the segment keywords by the keyword text.
# 	- and reassign keyword ids!

import common.db as db 
import re

query = """
SELECT *
FROM thesaurus2015
"""

# Pass One: To get new unique IDs to each of teh terms matched by text. 
cursor = db.get_cursor()
cursor.execute(query)
newThesaurus = {}
backMap = {}
wordCount = 1
while True: 
	row = cursor.fetchone()
	if not row: 
		break
	if row:
		splice = row["TermLabel"].strip() 
		if splice not in newThesaurus: 
			newThesaurus[splice] = {
				"TermLabel": row["TermLabel"].strip(),
				"ThesaurusID": row["ThesaurusID"],
				"ThesaurusType": row["ThesaurusType"],
				"KeywordCount": row["KeywordCount"],
Ejemplo n.º 18
0
from common import db

# find testimonies with > 40% saturation
#
# find all testimonies

table_name = 'segmentKeywordsSecondParents2015filled'

print "Clearing table ..."
create_table_cursor = db.get_cursor()
create_table_cursor.execute(
    '''DROP TABLE IF EXISTS {table_name}'''.format(**{
        'table_name': table_name,
    }))
create_table_cursor.execute('''CREATE TABLE {table_name}
    (
        SegmentNumber INT,
        SegmentID INT,
        TermID INT,
        TermLabel VARCHAR(255),
        SecondLevelParentID INT,
        ParentLabel VARCHAR(255),
        IntCode INT
    )
    '''.format(**{
    'table_name': table_name,
}))
print 'Table cleared'

get_testimony_segments = """
    SELECT SegmentNumber, SegmentID, TermID, TermLabel, SecondLevelParentID, ParentLabel, IntCode
# This code: 
# First, the all of the testimonies and their details are collectedi from the EVEN segments testimonies table. 
# Second, the saturation scores and the unique_tags are gotten from the segnment-keywords table. 
# Third, 
from __future__ import division
import common.db as db
import json

query = """
SELECT * from evenSegmentTestimonies2015
"""
cursor = db.get_cursor()
cursor.execute(query)
evenTestimoniesList = []
while True: 
	row = cursor.fetchone()
	if not row:
		break
	if row: 
		evenTestimoniesList.append(row["IntCode"])
cursor.close()
refinedEvenTestimoniesList = ', '.join(a for a in evenTestimoniesList)

query = """
SELECT ES.IntCode AS IntCode, MAX( ES.SegmentNumber ) AS TotSegments, CS.ShortFormID AS ShortFormID
FROM segmentKeywordsTable2015 AS ES, testimonies2015 AS CS
WHERE ES.IntCode = CS.IntCode
AND CS.ShortFormID = 2
AND ES.IntCode in ({0})
GROUP BY ES.IntCode
"""