def ProcessSingleHeadword(word, lines, idx, dirOut='E:/FULLTEXT/LEXICO/OUTPUT'):
    """Write the extracted lines for one headword to a text file.

    Args:
        word: headword used to name the output file.
        lines: list of text lines to write.
        idx: disambiguation index; appended as ``_<idx>`` when > 0 so
            multiple entries for the same headword do not overwrite each other.
        dirOut: output directory. Defaults to the previously hard-coded path,
            so existing callers are unaffected; new callers can redirect output.
    """
    if idx > 0:
        pathOut = dirOut + '/' + word + '_' + str(idx) + '.txt'
    else:
        pathOut = dirOut + '/' + word + '.txt'

    writeListToFile(lines, pathOut)
def processJSONDirectory(dataDir, logDir):
    """Process every JSON file in dataDir, writing a dated activity log."""
    logPath = getDatedFilePath('JSON_To_Mongo_Log', logDir)
    print('log path', logPath)

    startMsg = 'Started processing JSON at ' + getDateStamp()
    logData = [startMsg]
    print(startMsg)

    # Each data file contributes its own log lines.
    for fileName in os.listdir(dataDir):
        logData += processSingleFile(fileName, dataDir)

    endMsg = 'Finished processing JSON at ' + getDateStamp()
    logData.append(endMsg)
    print(endMsg)

    writeListToFile(logData, logPath)
Esempio n. 3
0
def RunCode(START_NUMBER, session, headers):
    """Scrape definitions for 100 words starting at index START_NUMBER."""
    PATH_IN = "E:/FULLTEXT/SPECIALTY/NLTK_Words_List.txt"
    PATH_DATA_OUT = "E:/FULLTEXT/GOOGLE/RAW"
    PATH_LOG_OUT = "E:/FULLTEXT/GOOGLE/LOG"
    STOP_NUMBER = START_NUMBER + 100

    pathDataOut, pathStatusOut = sysHand.getIncrementPath(
        START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)

    wordList = sysHand.getWordFromTextFile(PATH_IN)

    results = []
    status = []

    for idx in range(START_NUMBER, STOP_NUMBER):
        data, message = getSingleWord(wordList[idx], session, headers)
        print(idx, ':', message)
        status.append(str(idx) + ' ' + message)
        if data:
            results.append(data)
        time.sleep(3)  # throttle requests between words

    sysHand.writeDataToJSON(results, pathDataOut)
    sysHand.writeListToFile(status, pathStatusOut)
Esempio n. 4
0
def ProcessSingleHeadword(word, lines, idx, dirOut):
    """Write one headword's lines to <dirOut>/<word>[_<idx>].txt.

    idx > 0 appends a suffix so repeated headwords don't overwrite each other.
    """
    suffix = '_' + str(idx) if idx > 0 else ''
    pathOut = dirOut + '/' + word + suffix + '.txt'
    writeListToFile(lines, pathOut)
def processText(pathIn, dirOut):
    """Trim/sort the word list in pathIn, write to dirOut, open it, exit."""
    outPath = sysHandle.getRawPath(pathIn, dirOut)
    rawWords = sysHandle.getWordFromTextFile(pathIn)
    sysHandle.writeListToFile(trim_and_sort(rawWords), outPath)
    sysHandle.openDir(dirOut)
    sys.exit()
def processText(pathIn, dirOut, dirLog):
    """Extract sentences from pathIn into dirOut, logging start/end times."""
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Extract_Sentence_on_", dirLog)

    logData = ["Starting to extract sentences at " + sysHandle.getDateStamp()]

    extractSentences(pathIn, pathOut)

    logData.append("Sentence extracting completed at " + sysHandle.getDateStamp())

    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
Esempio n. 7
0
def runDownLoad(START_NUMBER, proxies, headers, mode, location):
    """Scrape a batch of 10 Lexico pages starting at word index START_NUMBER.

    Args:
        START_NUMBER: index into the combined word list to start from.
        proxies: proxy mapping; proxies['http'] is recorded in the log.
        headers: HTTP headers; headers['User-Agent'] is recorded in the log.
        mode: "local" writes under E:/FULLTEXT, "remote" under a Dropbox folder.
        location: "home" or "office"; selects the Dropbox user path
            (only consulted when mode == "remote").

    Side effects: writes one HTML file per successfully fetched word plus a
    per-batch status log, and sleeps 7 seconds between requests.
    NOTE(review): if mode/location match none of the branches, the output
    dirs stay '' and downstream path building will misbehave -- confirm
    callers always pass valid values.
    """

    PATH_IN = "E:/FULLTEXT/DICTIONARY/NORMALCASE/Combined Lexico Oxford.txt"

    DIR_DATA_OUT = ''
    DIR_LOG_OUT = ''

    print('Path In:', PATH_IN)
    #For Home only

    # Pick output locations based on where the script is running.
    if (mode == "local"):
        DIR_DATA_OUT = "E:/FULLTEXT/LEXICO/HTML"
        DIR_LOG_OUT = "E:/FULLTEXT/LEXICO/LOG"

    elif (mode == "remote"):
        if (location == "home"):
            DIR_DATA_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"
        elif (location == "office"):
            DIR_DATA_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"

    print('\nData Path:', DIR_DATA_OUT, '\nLog Path:', DIR_LOG_OUT)

    # Batch size is fixed at 10 words per run.
    STOP_NUMBER = START_NUMBER + 10

    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])

    #NOTE: LOG IS FOR EVERY BATCH
    #pathDataOut, pathStatusOut = sysHand.getIncrementPath(START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)
    pathStatusOut = sysHand.getIncrementLogPath(START_NUMBER, DIR_LOG_OUT)

    wordList = sysHand.getLineFromTextFile(PATH_IN)

    #results = []
    # Status log records the batch parameters up front for traceability.
    status = []
    dateStamp = sysHand.getDateStamp()
    status.append('Starting scraping Lexico at  ' + dateStamp)
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + proxies['http'])
    status.append('Starting scraping using agent ' + headers['User-Agent'])

    for i in range(START_NUMBER, STOP_NUMBER):
        # One output HTML file per word index.
        pathDataOut = sysHand.getIncrementDataPath(i, DIR_DATA_OUT)
        word = wordList[i]
        (htmlData, message) = getSingleWord(word, proxies, headers)
        if (htmlData):
            with open(pathDataOut, "w", encoding='utf-8') as file:
                file.write(htmlData)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        time.sleep(7)  # throttle: 7s between requests

    #sysHand.writeDataToJSON(results, pathDataOut)
    dateStamp = sysHand.getDateStamp()
    status.append('Ending scraping Lexico at ' + dateStamp)
    sysHand.writeListToFile(status, pathStatusOut)
Esempio n. 8
0
def RunCode(START_NUMBER, proxies, headers, mode, location):
    """Scrape a batch of 100 Google definitions starting at START_NUMBER.

    Args:
        START_NUMBER: index into the NLTK word list to start from.
        proxies: proxy mapping passed to requests; verified via icanhazip.
        headers: HTTP headers; headers['User-Agent'] is recorded in the log.
        mode: "local" writes under E:/FULLTEXT, "remote" under a Dropbox folder.
        location: "home" or "office"; selects the Dropbox user path
            (only consulted when mode == "remote").

    Side effects: writes the collected results as one JSON file plus a
    status log, and sleeps 3 seconds between requests.
    NOTE(review): if mode/location match none of the branches, the output
    paths stay '' -- confirm callers always pass valid values.
    """

    PATH_IN = "E:/FULLTEXT/DICTIONARY/SPECIALTY/NLTK_Words_List.txt"
    PATH_DATA_OUT = ''
    PATH_LOG_OUT = ''

    print('Path In:', PATH_IN)
    #For Home only

    # Pick output locations based on where the script is running.
    if (mode == "local"):
        PATH_DATA_OUT = "E:/FULLTEXT/GOOGLE/RAW"
        PATH_LOG_OUT = "E:/FULLTEXT/GOOGLE/LOG"

    elif (mode == "remote"):
        if (location == "home"):
            PATH_DATA_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/RAW"
            PATH_LOG_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/LOG"
        elif (location == "office"):
            PATH_DATA_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/RAW"
            PATH_LOG_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/LOG"

    print('\nData Path:', PATH_DATA_OUT, '\nLog Path:', PATH_LOG_OUT)

    # Batch size is fixed at 100 words per run.
    STOP_NUMBER = START_NUMBER + 100

    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])
    # Echo the effective outbound IP so the proxy can be verified in logs.
    urlTest = "http://icanhazip.com"
    resTest = requests.get(urlTest, proxies=proxies, headers=headers)
    print('using IP:', resTest.text)

    pathDataOut, pathStatusOut = sysHand.getIncrementPath(
        START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)

    wordList = sysHand.getWordFromTextFile(PATH_IN)

    # Status log records the batch parameters up front for traceability.
    results = []
    status = []
    dateStamp = sysHand.getDateStamp()
    status.append('Starting scraping Google at  ' + dateStamp)
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + resTest.text)
    status.append('Starting scraping using agent ' + headers['User-Agent'])

    for i in range(START_NUMBER, STOP_NUMBER):
        word = wordList[i]
        (data, message) = getSingleWord(word, proxies, headers)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        if (data):
            results.append(data)
        time.sleep(3)  # throttle: 3s between requests

    sysHand.writeDataToJSON(results, pathDataOut)
    dateStamp = sysHand.getDateStamp()
    status.append('Ending scraping Google at ' + dateStamp)
    sysHand.writeListToFile(status, pathStatusOut)
Esempio n. 9
0
def processHTML(fileName, dirIn, dirOut):
    """Convert one Lexico HTML file under dirIn to a .txt file under dirOut."""
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileName.replace(".html", ".txt"))

    with open(pathIn, "r", encoding="utf-8") as htmlFile:
        wordData = processLexico(htmlFile.read())

    writeListToFile(wordData, pathOut)
Esempio n. 10
0
def getProxyList(targetURL, pathOut):
    """Download a newline-separated proxy list from targetURL and save it.

    Best-effort: on HTTP status other than 200 nothing is written; any
    exception is printed and swallowed.
    """
    pool = urllib3.PoolManager()
    try:
        response = pool.request('GET', targetURL)
        print('Getting data from', targetURL, '...')
        if response.status == 200:
            body = response.data.decode('utf-8')
            writeListToFile(str(body).split('\n'), pathOut)

    except Exception as err:
        print(err)
Esempio n. 11
0
def getZERO_to_ING_forms(wordForms, dictForms, outputDir):
    """Find pairs where appending 'ing' to a dictionary form yields a known
    word form, e.g. WALK => WALKING, and write them to a match-pair file.

    Args:
        wordForms: collection of observed inflected forms.
        dictForms: iterable of dictionary (base) forms.
        outputDir: directory that receives ZERO_to_ING_Match_Pairs.txt.
    """
    pathOut = outputDir + "/ZERO_to_ING_Match_Pairs.txt"

    # Build the set once: O(1) membership instead of scanning a list per word.
    formSet = set(wordForms)

    wordPairs = []
    for dictForm in dictForms:
        inflection = dictForm + 'ing'
        if inflection in formSet:
            wordPairs.append(dictForm + ', ' + inflection)
            print('finding...', dictForm, ', ', inflection)
    writeListToFile(wordPairs, pathOut)
Esempio n. 12
0
def getZ_to_ES_forms(wordForms, dictForms, outputDir):
    """Find pairs where a 'z'-final dictionary form plus 'es' yields a known
    word form, e.g. BUZZ => BUZZES, and write them to a match-pair file.

    Args:
        wordForms: collection of observed inflected forms.
        dictForms: iterable of dictionary (base) forms.
        outputDir: directory that receives Z_to_ES_Match_Pairs.txt.
    """
    pathOut = outputDir + "/Z_to_ES_Match_Pairs.txt"

    # Build the set once: O(1) membership instead of scanning a list per word.
    formSet = set(wordForms)

    wordPairs = []
    for dictForm in dictForms:
        inflection = dictForm + 'es'
        if dictForm.endswith('z') and inflection in formSet:
            wordPairs.append(dictForm + ', ' + inflection)
            print('finding...', dictForm, ', ', inflection)
    writeListToFile(wordPairs, pathOut)
Esempio n. 13
0
def get_T_to_TTING_forms(wordForms, dictForms, outputDir):
    """Find pairs where a 't'-final dictionary form plus 'ting' yields a
    known word form, e.g. SET => SETTING, and write them to a match-pair file.

    Args:
        wordForms: collection of observed inflected forms.
        dictForms: iterable of dictionary (base) forms.
        outputDir: directory that receives T_To_TTING_Match_Pairs.txt.
    """
    pathOut = outputDir + "/T_To_TTING_Match_Pairs.txt"

    # Build the set once: O(1) membership instead of scanning a list per word.
    formSet = set(wordForms)

    wordPairs = []
    for dictForm in dictForms:
        inflection = dictForm + 'ting'
        if dictForm.endswith('t') and inflection in formSet:
            wordPairs.append(dictForm + ', ' + inflection)
            print('finding...', dictForm, ', ', inflection)
    writeListToFile(wordPairs, pathOut)
Esempio n. 14
0
def get_IE_to_YNG_forms(wordForms, dictForms, outputDir):
    """Find pairs where an 'ie'-final dictionary form, with 'ie' replaced by
    'yng', yields a known word form (e.g. TIE => TYNG as coded).

    NOTE(review): the suffix is literally 'yng'; 'ying' (TIE => TYING) may
    have been intended -- confirm before changing.

    Args:
        wordForms: collection of observed inflected forms.
        dictForms: iterable of dictionary (base) forms.
        outputDir: directory that receives IE_To_YNG_Match_Pairs.txt.
    """
    pathOut = outputDir + "/IE_To_YNG_Match_Pairs.txt"

    # Build the set once: O(1) membership instead of scanning a list per word.
    formSet = set(wordForms)

    wordPairs = []
    for dictForm in dictForms:
        if len(dictForm) > 2:  # need at least one char before the 'ie' ending
            inflection = dictForm[:-2] + 'yng'
            if dictForm.endswith('ie') and inflection in formSet:
                wordPairs.append(dictForm + ', ' + inflection)
                print('finding...', dictForm, ', ', inflection)
    writeListToFile(wordPairs, pathOut)
Esempio n. 15
0
def processText(inFile, outDir, dbDir):
    """Write to outDir the words from inFile that appear neither in the
    parallel database file (same name under dbDir) nor in the standard
    word list, then open the output directory and exit.

    Args:
        inFile: path of the trash word list to filter.
        outDir: directory for the filtered (recycle) output.
        dbDir: directory holding the parallel database word file.
    """
    pathRecycleOut = sysHandle.getRawPath(inFile, outDir)
    pathDatabaseIn = sysHandle.getRawPath(inFile, dbDir)

    trashList = sysHandle.getWordFromTextFile(inFile)
    # Sets give O(1) membership tests; the original list scans were O(n*m).
    databaseSet = set(sysHandle.getWordFromTextFile(pathDatabaseIn))
    standardSet = set(getWordList())

    newRecycle = [
        item for item in trashList
        if item not in databaseSet and item not in standardSet
    ]

    sysHandle.writeListToFile(newRecycle, pathRecycleOut)
    sysHandle.openDir(outDir)
    sys.exit()
Esempio n. 16
0
def processText(inFile, outDir):
    """Drop match lines with a blank first field, de-duplicate preserving
    order, and write the result to outDir."""
    outFilePath = getOutPath(inFile, outDir)

    rawLines = sH.readTextFile(inFile).split("\n")

    # Keep only lines whose first comma-separated field is non-blank.
    kept = [line for line in rawLines if line.split(",")[0].strip()]

    # dict.fromkeys de-duplicates while keeping first-seen order.
    uniqueLines = list(dict.fromkeys(kept))

    sH.writeListToFile(uniqueLines, outFilePath)

    sH.openDir(outDir)
    sys.exit()
Esempio n. 17
0
def main(dirOut):
    """Scrape Lexico word-list pages for letter 'z' and save each page."""
    WORD = "z"
    START_NUMBER = 2
    STOP_NUMBER = 3
    STEP_NUMBER = 1

    proxies = startPrivateProxy()

    for pageNum in range(START_NUMBER, STOP_NUMBER, STEP_NUMBER):
        print('processing list ', WORD, pageNum)
        # Fresh random agent per page to vary the request fingerprint.
        headers = {'User-Agent': getRamdomUserAgent()}
        indexStr = str(pageNum)
        pathOut = dirOut + '/list' + WORD + indexStr + ".txt"
        htmlContent = getLexico(WORD, indexStr, proxies, headers)
        writeListToFile(processLexicoList(htmlContent, WORD), pathOut)
        time.sleep(10)  # polite delay between page requests
Esempio n. 18
0
def processText(pathIn, dirOut, dirExclusion, dirLog):
    """Extract words from pathIn, clean them, drop excluded/empty items,
    and write the result plus a timestamped activity log.

    Args:
        pathIn: input text file with raw words.
        dirOut: directory for the cleaned word list.
        dirExclusion: directory of exclusion dictionaries (matched in
            lower case).
        dirLog: directory that receives the dated log file.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Word_Extract_Log_", dirLog)

    logData = []

    def _log(message):
        # Record and echo one progress message.
        logData.append(message)
        print(message)

    _log("Starting to extract words at " + sysHandle.getDateStamp())

    # STEP 1: read data file and split to get words
    words = sysHandle.getWordFromTextFile(pathIn)
    _log("Reading word list completed at " + sysHandle.getDateStamp())

    # STEP 2: trim left, right, remove overlappings and sort
    wordList = cleanWordList(words)
    _log("Trimming word list completed at " + sysHandle.getDateStamp())

    # STEP 3: drop empty strings and items found in the exclusion list.
    # A set makes each membership test O(1) instead of scanning a list.
    exclusionSet = set(sysHandle.loadDictionaries(dirExclusion))
    cleanList = [w for w in wordList if w and w.lower() not in exclusionSet]

    # (typo "exluded" fixed in the log message)
    _log("Removing excluded items completed at " + sysHandle.getDateStamp())

    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
Esempio n. 19
0
def uploadData(pathIn, bookID, dirLog):
    """Upload sentences from pathIn to MySQL under bookID, writing a dated log."""
    pathLog = sysHandle.getDatedFilePath("Upload_Sentences_To_MySQL_on_", dirLog)

    logData = [
        "Starting to upload sentences at " + sysHandle.getDateStamp(),
        "BookID being uploaded: " + str(bookID),
    ]

    sentence_total = upload_data(pathIn, bookID)
    logData.append("Total sentences written to MySQL " + str(sentence_total))

    logData.append("Sentence uploading completed at " + sysHandle.getDateStamp())

    sysHandle.writeListToFile(logData, pathLog)
    sys.exit()
Esempio n. 20
0
def processTab3(pathClean, dirRaw, dirRecycle):
    """Compute raw-minus-clean pairs and write them to the recycle directory.

    Loads the raw and recycle pair files that parallel pathClean, keeps raw
    items absent from the clean file, unpacks them back to lines, writes
    the recycle file, opens its directory, and exits.
    """
    pathRaw = getRawPath(pathClean, dirRaw)
    pathRecycle = getRawPath(pathClean, dirRecycle)

    contentRaw = loadFormatPairFile(pathRaw)
    # Set membership is O(1) per item; assumes pair items are hashable
    # (strings/tuples) -- TODO confirm loadFormatPairFile's item type.
    cleanSet = set(loadFormatPairFile(pathClean))

    recycleData = [item for item in contentRaw if item not in cleanSet]

    writeListToFile(unpackPairs(recycleData), pathRecycle)
    openDir(dirRecycle)
    sys.exit()
Esempio n. 21
0
def mergePairs(dirIn, dirOut, dirRecycle):
    """Merge word pairs from dirIn, excluding pairs already stored in MySQL
    or present in the recycle directory, and write the remainder to dirOut.
    """
    pathOut = sysHandle.getOutPath(dirOut)

    bigDict = sysHandle.loadWordPairs(dirIn)

    # One set covers both exclusion sources with O(1) lookups; assumes pair
    # items are hashable (strings/tuples) -- TODO confirm item type.
    excluded = set(getPairList()) | set(sysHandle.loadWordPairs(dirRecycle))

    outDict = [item for item in bigDict if item not in excluded]

    sysHandle.writeListToFile(unpackPairs(outDict), pathOut)
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    """Dictionary-check the raw word list in inFile.

    Splits words into clean/trash using the dictionaries in dictDir, drops
    upper-case "clean" words whose lower-case form already exists in the
    MySQL word list, removes trash items already present in the recycle
    dictionaries, writes the clean/trash/log files, opens outDir, and exits.
    """
    pathLog = sysHandle.getDatedFilePath("Dictionary_Check_Log_", logDir)
    logData = []

    def _log(message):
        # Record and echo one progress message.
        logData.append(message)
        print(message)

    # (typo "directionary" fixed in the log messages)
    _log("Starting to dictionary-check at " + sysHandle.getDateStamp())

    pathOutClean = sysHandle.getRawPath(inFile, outDir)
    pathOutTrash = sysHandle.getRawPath(inFile, trashDir)

    rawList = convertList(sysHandle.readTextFile(inFile))
    dicList = sysHandle.loadDictionaries(dictDir)

    # Split clean and trash based on dictionary membership.
    listClean, listTrash = filterList(rawList, dicList)

    # Split into lower-case and upper-case parts.
    lowerClean, upperClean = splitDictByCase(listClean)

    # Words already in the MySQL database, split the same way.
    lowerDic, upperDic = splitDictByCase(getWordList())

    _log("Loading dictionary completed at " + sysHandle.getDateStamp())

    # Set lookup: O(1) per word instead of scanning the list each time.
    lowerDicSet = set(lowerDic)
    newUpperClean = [
        item for item in upperClean if item.lower() not in lowerDicSet
    ]

    newClean = newUpperClean + lowerClean

    _log("Completed dictionary checking at " + sysHandle.getDateStamp())

    recycleSet = set(sysHandle.loadDictionaries(recycleDir))
    newListTrash = [item for item in listTrash if item not in recycleSet]

    sysHandle.writeListToFile(newClean, pathOutClean)
    sysHandle.writeListToFile(newListTrash, pathOutTrash)

    _log("Finished dictionary checking at " + sysHandle.getDateStamp())
    sysHandle.writeListToFile(logData, pathLog)

    sysHandle.openDir(outDir)
    sys.exit()
Esempio n. 23
0
            print('Connection Error with IP ' + str(item))
            statList.append('Connection Error with IP ' + str(item))
        except urllib3.exceptions.MaxRetryError:
            print('Max Retry Error with IP ' + str(item))
            statList.append('Max Retry Error with IP ' + str(item))

        except Exception as e:
            statList.append('Error verifying IP ' + str(item))
            print(e)

    file.close()

    dateStamp = sysHand.getDateStamp()

    statList.append('Finish verifying proxy at ' + dateStamp)

    return statList


if __name__ == '__main__':
    proxyDir = 'D:/Proxy/List'
    outPath = 'D:/Proxy/Filter/good_proxy_list.txt'
    logDir = 'D:/Proxy/Log'
    logPath = sysHand.getDatedFilePath("Proxy_Verification_Log_", logDir)

    # Truncate the output file before verification appends to it.
    open(outPath, "w").close()

    proxyList = sysHand.loadProxyLines(proxyDir)
    statusList = verifyProxy(proxyList, outPath)
    sysHand.writeListToFile(statusList, logPath)
Esempio n. 24
0
def secondRun(fileName, dirIn, dirOut):
    """Merge consecutive lines that share the same key.

    Reads fileName from dirIn, splits each non-empty line into (key, text)
    via splitLine, then groups runs of consecutive lines with an identical
    key:
      * a run of one line is written back as key + text;
      * a longer run is written as key + text1|text2|...| (every segment
        followed by '|').
    The merged lines are written to the same fileName under dirOut.

    Bug fix: the previous span-tracking loop silently dropped the final run
    whenever the last two lines shared a key (e.g. a file whose last lines
    all carry one key produced no output for them). itertools.groupby
    handles every run uniformly, including the last one.
    """
    from itertools import groupby
    from operator import itemgetter

    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileName)

    lines = getLineFromTextFile(pathIn)

    # Keep only non-empty lines, parsed into (key, text) tuples.
    lineTuple = []
    for line in lines:
        if line:
            key, text = splitLine(line)
            lineTuple.append((key, text))

    dataOut = []
    # groupby collapses runs of consecutive tuples with the same key.
    for key, run in groupby(lineTuple, key=itemgetter(0)):
        texts = [text for _, text in run]
        if len(texts) == 1:
            dataOut.append(key + texts[0])
        else:
            # Multi-line run: every segment gets a trailing '|'.
            dataOut.append(key + ''.join(t + '|' for t in texts))

    writeListToFile(dataOut, pathOut)
Esempio n. 25
0
import sys
from mysql_data import getWordList
from system_handler import writeListToFile, getWordFromTextFile

pathIn = "E:/FULLTEXT/TEMPOUT/GENERAL_MATCHING_PAIRS.txt"
pathOut = "E:/FULLTEXT/SPECIALTY/GENERAL_MATCHING_PAIRS.txt"

wordPairs = getWordFromTextFile(pathIn)

newPairs = []

for pair in wordPairs:
    items = pair.split(',')
    # Guard: skip blank or malformed lines (fewer than two comma fields)
    # instead of crashing with IndexError on items[1].
    if len(items) >= 2:
        newPairs.append(items[0].strip() + ', ' + items[1].strip())

# De-duplicate preserving first-seen order, then sort alphabetically.
results = sorted(dict.fromkeys(newPairs))
writeListToFile(results, pathOut)
Esempio n. 26
0
    #print(recentFile)
    fileList = os.listdir(dirIn)
    lastFile = ''
    prefix = 'Lexicon_Second_Run_Log_'
    logData = []
    logPath = getDatedFilePath(prefix, dirLog)
    #print('log path:', logPath)
    timeStamp = getDateStamp()
    message = 'Starting processing at ' + timeStamp
    logData.append(message)
    print(message)

    for item in fileList:
        if (item > recentFile):
            lastFile = item
            message = 'Processsing item ' + item
            logData.append(message)
            print(message)
            message = processRawText(item, dirIn, dirOut)
            logData.append(message)
            print(message)

    #WRITE INI
    cf.set_config_value(cf.RECENT_OPEN_FILE2, lastFile)
    timeStamp = getDateStamp()
    message = 'Finished processing at ' + timeStamp
    logData.append(message)
    print(message)
    writeListToFile(logData, logPath)
    openDir(dirOut)
Esempio n. 27
0
from nltk.corpus import words, stopwords
import system_handler as sysHand

#english_stopwords = stopwords.words('english')

#filePath = "E:/FULLTEXT/SPECIALTY/English_Stop_Words.txt"

#sysHand.writeListToFile(english_stopwords, filePath)

# Dump the full NLTK word corpus to a plain-text file, one word per line.
filePath = "E:/FULLTEXT/SPECIALTY/Full_Words_List.txt"
wordlist = words.words()

sysHand.writeListToFile(wordlist, filePath)

#print(english_stopwords)
Esempio n. 28
0

def formatWordList(records):
    """Extract the first column from database result rows.

    Args:
        records: iterable of row tuples, as returned by cursor.fetchall().

    Returns:
        list containing the first field of each row.
    """
    # Comprehension replaces the manual append loop (same order, same items).
    return [record[0] for record in records]


def getWordList():
    """Return all distinct headwords from the google_defs table.

    Returns:
        list of words on success; None if the query fails (the error is
        printed, matching the original best-effort behavior).
    """
    DB_NAME = "lexicon"
    db = get_connection(DB_NAME)
    cursor = db.cursor()
    select_sql = ("select distinct word from google_defs")
    try:
        cursor.execute(select_sql)
        records = cursor.fetchall()
        return formatWordList(records)
    except Exception as e:
        print("Error encountered:", e)
    finally:
        # Bug fix: the originals were bare attribute references
        # (cursor.close / db.close) that never actually closed anything.
        cursor.close()
        db.close()


if __name__ == "__main__":
    # Dump every distinct headword from the database to a text file.
    dirOut = "E:/FULLTEXT/SPECIALTY/"
    pathOut = dirOut + "Dictionary_Headword_List.txt"
    writeListToFile(getWordList(), pathOut)
    openDir(dirOut)
Esempio n. 29
0
                    itemData.append(wordDiv)
        elif (item.name == 'strong'):
            wordStrong = processStrong(item)
            if (wordStrong):
                itemData.append(wordStrong)
    return itemData

    #entryList = soup.find_all('div', {'class' : 'entryWrapper'})
    #print(len(entryList))
    #for item in entryList:
    #	print(item)


if __name__ == "__main__":
    # Convert one letter's Lexico HTML dump into plain-text word data.
    WORD = "a"
    dirOut = "E:/FULLTEXT/LEXICO/TEXT"
    pathIn = "E:/FULLTEXT/LEXICO/HTML/" + WORD + ".html"
    pathOut = getFilePath(pathIn, dirOut)

    with open(pathIn, "r", encoding="utf-8") as htmlFile:
        wordData = processLexico(htmlFile.read())

    writeListToFile(wordData, pathOut)
    openDir(dirOut)
Esempio n. 30
0
import re
import system_handler as sysHand

# Extract entries consisting of exactly two whitespace-separated word
# tokens (e.g. "Aberdeen Angus") from the Oxford word list.
inPath = "E:/FULLTEXT/SPECIALTY/Oxford_Word_List.txt"
outPath = "E:/FULLTEXT/SPECIALTY/Oxford_Two_Word_Compound.txt"

# ^word whitespace word$; re.M kept for behavioral compatibility even
# though each candidate is matched as a single line.
pattern = re.compile(r'^\w+\s\w+$', re.M)

wordList = sysHand.readTextFile(inPath).split('\n')

# pattern.search(word) is the idiomatic form of re.search(pattern, word);
# the comprehension replaces the manual append loop.
twoWords = [word for word in wordList if pattern.search(word)]

sysHand.writeListToFile(twoWords, outPath)