def randomlySelectNDocsFromPath(folderPath, n=100):
    """ given a folder path, return a list of n randomly selected file paths """
    dejaVus = set()
    randomSelected = set()
    # get all the tmx files in the folder
    wholeFolderContent = utilsOs.goDeepGetFiles(folderPath, format=u'.tmx')
    # if there are fewer files in the folder than n, return them all
    if len(wholeFolderContent) <= n:
        return wholeFolderContent
    # get n randomly selected files from the whole
    for e in range(n):
        index = getRandomIntNeverseenBefore(len(wholeFolderContent), dejaVus)
        # add to dejavus and to the random selected list
        dejaVus.add(index)
        randomSelected.add(wholeFolderContent[index])
    # get the domain
    if folderPath[-1] == u'/':
        domain = folderPath[:-1].split(u'/')[-1]
    elif u'.' in folderPath.split(u'/')[-1]:
        path = folderPath.replace(u'/{0}'.format(folderPath.split(u'/')[-1]),
                                  u'')
        domain = path.split(u'/')[-1]
    else:
        domain = folderPath.split(u'/')[-1]
    # dump the set
    utilsOs.dumpDictToJsonFile(
        list(randomSelected),
        pathOutputFile='./randomSelected{0}{1}.json'.format(n, domain),
        overwrite=True)
    return randomSelected
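
Note: every example on this page calls utilsOs.dumpDictToJsonFile, whose body is not reproduced. Below is a minimal sketch of what such a helper might look like, assuming it simply wraps json.dump and honours an overwrite flag; the real signature and behaviour may differ.

import json
import os

def dumpDictToJsonFile(aDict, pathOutputFile=u'./dump.json', overwrite=False):
    """ hypothetical sketch: dump a dict into a json file, refusing to
    overwrite an existing file unless asked to """
    if not overwrite and os.path.isfile(pathOutputFile):
        return pathOutputFile  # leave the existing file untouched
    with open(pathOutputFile, u'w', encoding=u'utf8') as outFile:
        json.dump(aDict, outFile, ensure_ascii=False, indent=4)
    return pathOutputFile
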
Example #2
def makeSampleFileHavingNJobTitles(pathInput, pathOutput, n=1000000, addJobDescription=False):
	'''
	takes the real linkedIn data and makes a sample containing as many
	profiles as necessary to reach N functions (job titles)
	'''
	dictJobTitlesData = {}
	#sample of all candidates data
	outputJson = utilsOs.createEmptyFile(u'%ssample.json' %(pathOutput))
	
	#we read the original json file line by line
	with codecs.open(pathInput, u'r', encoding=u'utf8') as jsonFile:
		while len(dictJobTitlesData) < n:
			jsonData = jsonFile.readline()
			#stop if the file ends before we reach n job titles
			if not jsonData:
				break
			#we dump each line into the sample file
			outputJson.write(jsonData.replace(u'\r', u''))
			#we make a dict out of the string line
			jsonLine = utilsOs.convertJsonLineToDict(jsonData)
			if jsonLine is not None:
				#we dump each job title into the jobtitle file
				dictJobTitlesData = getJobData(jsonLine, dictJobTitlesData)

	#dumping dict content in json
	utilsOs.dumpDictToJsonFile(dictJobTitlesData, pathOutputFile=u'%sjobTitlesDataDict.json'%(pathOutput))

	#SIMPLIFIED DATA: dumping job title (and optional description) to a file
	dumpJobTitleAndDescription(dictJobTitlesData, u'%sjob+pitch.tsv'%(pathOutput), addJobDescription)
	
	#closing the files
	outputJson.close()
	return None
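
Note: utilsOs.convertJsonLineToDict is not shown here; presumably it parses one line of a JSON-lines file and returns None on malformed input. A hypothetical sketch:

import json

def convertJsonLineToDict(jsonLine):
    """ hypothetical sketch: parse one line of a json-lines file,
    return None on empty or malformed input """
    try:
        return json.loads(jsonLine)
    except (ValueError, TypeError):
        return None
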
Example #3
def tokenDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    ### NEED TO ANALYSE WHETHER TO REMOVE THIS AND REPLACE IT WITH makeTokenCountDictFromText() DEFINITIVELY
    ######################################################################
    takes a corpus file, makes a dict of tokens with their count
    and dumps the result in a json file
    VERY SIMILAR TO makeTokenCountDictFromText() BUT MORE HANDS-ON AND SELF-BUILT
    '''
    import utilsOs
    tokenDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    for string in stringList:
        tokenList = naiveRegexTokenizer(string.replace(u'/', u' '))
        for token in tokenList:
            tokenDict[token] = tokenDict.get(token,
                                             0.0) + (1.0 / len(stringList))
            #we also add the lowercase version if there is an uppercase in the token
            if any(c.isupper() for c in token):
                tokenDict[token.lower()] = tokenDict.get(
                    token.lower(), 0.0) + (1.0 / len(stringList))
    if outputFilePath is None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(
                inputFilePath.split(u'/')[-1], 'tokens.json'))
    utilsOs.dumpDictToJsonFile(tokenDict, outputFilePath)
    return tokenDict
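
Note: naiveRegexTokenizer is referenced but not defined on this page. A minimal sketch, under the assumption that it splits on non-word characters while keeping word-internal apostrophes and hyphens (the real tokenizer may behave differently):

import re

def naiveRegexTokenizer(string):
    """ hypothetical sketch: naive regex tokenization, keeping
    word-internal apostrophes and hyphens """
    return re.findall(r"\w+(?:['-]\w+)*", string)
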
Example #4
def dumpSetToJson(aSet, pathOutput):
    '''
    dumps the given set (e.g. the flattened content of the mixed
    taxonomies/ontologies, regardless of hierarchy) to a json file,
    using each element as a key with a None value
    '''
    jsonDict = {}
    for elem in aSet:
        jsonDict[elem] = None
    utilsOs.dumpDictToJsonFile(jsonDict, pathOutput)
    return
def applyOnNotFlaggedForNHours(n=1):
    schedule = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/heurSchedule.json'
    scheduleDict = utilsOs.openJsonFileAsDict(schedule)
    # apply for n hours
    for nId in list(scheduleDict.keys())[:n]:
        indexesToApply = scheduleDict[nId]
        applyHeuristicsOnNotFlaggedCorpus(indexesToApply, nId)
        # remove from the dict once we dump the scores
        del scheduleDict[nId]
    # save the remaining schedule dict
    utilsOs.dumpDictToJsonFile(scheduleDict,
                               pathOutputFile=schedule,
                               overwrite=True)
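
Note: utilsOs.openJsonFileAsDict is used above but not reproduced on this page; presumably it is a thin wrapper around json.load, something like the following hypothetical sketch:

import codecs
import json

def openJsonFileAsDict(pathToFile):
    """ hypothetical sketch: load a whole json file as a dict """
    with codecs.open(pathToFile, u'r', encoding=u'utf8') as openedFile:
        return json.load(openedFile)
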
def makeHourlyIndexDict():
    folderPath = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/'
    with open(u'{0}files.paths'.format(folderPath)) as pathsFile:
        nbPaths = len(pathsFile.readlines())
    id = 0
    lastIdx = 0
    aDict = {}
    for idx in range(0, nbPaths, 600):  # range(364949, nbPaths, 600)
        aDict[id] = list(range(lastIdx, idx))
        lastIdx = idx
        id += 1
    aDict[id] = list(range(lastIdx, 394949))
    schedule = u'{0}heurSchedule.json'.format(folderPath)
    utilsOs.dumpDictToJsonFile(aDict, pathOutputFile=schedule, overwrite=True)
Example #7
def launchTmop(inputFilePath, pharaohFilePath, tokFilePath, outputFolderPath,
               **kwargs):
    utilsOs.createEmptyFolder(outputFolderPath)
    ##########
    # get and modif the config file
    configDict = getConfigTemplate()
    configDict["options"]["input file"] = inputFilePath
    configDict["options"]["align file"] = pharaohFilePath
    configDict["options"]["token file"] = tokFilePath
    configDict["options"]["output folder"] = outputFolderPath
    ##########
    # add a policy
    configDict["policies"].append(["FourNo", "on"])
    configDict["policies"].append(["ThreeNo", "on"])
    configDict["policies"].append(["TwoNo", "on"])
    ##########
    # turn off certain heuristics to launch only the interesting ones
    # # configDict["filters"][0][1] = "off" # "SampleFilter"
    # configDict["filters"][1][1] = "off" # "LengthStats"
    # # configDict["filters"][2][1] = "off" # "LengthRatio"
    # # configDict["filters"][3][1] = "off" # "ReverseLengthRatio"
    # # configDict["filters"][4][1] = "off" # "WordRatio"
    # # configDict["filters"][5][1] = "off" # "ReverseWordRatio"
    # # configDict["filters"][6][1] = "off" # "WordLength"
    # # configDict["filters"][7][1] = "off" # "TagFinder"
    # # configDict["filters"][8][1] = "off" # "RepeatedChars"
    # # configDict["filters"][9][1] = "off" # "RepeatedWords"
    # # configDict["filters"][10][1] = "off" # "Lang_Identifier"
    # configDict["filters"][11][1] = "off" # "AlignedProportion"
    # # configDict["filters"][12][1] = "off" # "BigramAlignedProportion"
    # configDict["filters"][13][1] = "off" # "NumberOfUnalignedSequences"
    # configDict["filters"][14][1] = "off" # "LongestAlignedSequence"
    # configDict["filters"][15][1] = "off" # "LongestUnalignedSequence"
    # configDict["filters"][16][1] = "off" # "AlignedSequenceLength"
    # configDict["filters"][17][1] = "off" # "UnalignedSequenceLength"
    # configDict["filters"][18][1] = "off" # "FirstUnalignedWord"
    # configDict["filters"][19][1] = "off" # "LastUnalignedWord"
    # configDict["filters"][20][1] = "off" # "WE_Average"
    # configDict["filters"][21][1] = "off" # "WE_Median"
    # configDict["filters"][22][1] = "off" # "WE_BestAlignScore"
    # configDict["filters"][23][1] = "off" # "WE_ScoreOtherAlignment"
    # configDict["filters"][24][1] = "off" # "WE_ScoreAlign_BestForRest"
    for k, v in kwargs.items():
        configDict["options"][k] = v
    # dump the config.json file
    tmopFolder = "/data/rali5/Tmp/alfonsda/workRali/004tradBureau/TMOP-master"
    utilsOs.dumpDictToJsonFile(configDict,
                               "{0}/config.json".format(tmopFolder),
                               overwrite=True)
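
Note: getConfigTemplate is not reproduced here. Judging only from the keys accessed in launchTmop, it returns a dict with an "options" section, a "policies" list and a "filters" list of [name, "on"/"off"] pairs. A hypothetical skeleton (the real TMOP template contains more fields and the full filter list):

def getConfigTemplate():
    """ hypothetical skeleton of the TMOP config template,
    limited to the keys used in launchTmop() """
    return {
        "options": {
            "input file": "",
            "align file": "",
            "token file": "",
            "output folder": "",
        },
        "policies": [],
        "filters": [["SampleFilter", "on"], ["LengthStats", "on"]],  # truncated: the real list has ~25 entries
    }
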
def makeDictFromTsvTrain(pathToTrainTsv,
                         trainingDataColumnName,
                         goldStandardColumnName,
                         trainedDict=None,
                         outputDictFilePath=None,
                         preOrorazeOrig=False,
                         alignMostSimilar=False):
    '''
    Given a path to a "train" file, applies a heuristic dict maker:
    it opens the file, searches for all possible general-language spell
    corrections and keeps the ones that reappear somewhere else
    in the corpus
    '''
    trainedDict = trainedDict if trainedDict is not None else {}
    #open the train dataframe from the path
    trainDf = utilsOs.getDataFrameFromArgs(pathToTrainTsv)
    #get the specific data we want to use as train to populate our dict (original and gold standard)
    trainDataDf = trainDf[[trainingDataColumnName, goldStandardColumnName]]
    #get the gold standard data to which compare the training data
    for indexRow, row in trainDataDf.iterrows():
        #get the elements not matching exactly
        if not preOrorazeOrig:
            nonMatchingAlignment = getNonExactMatch(row[0], row[1])
        #we pre-ororaze if asked, to limit the differences in segmentation and orora-syntax-oriented problems
        else:
            nonMatchingAlignment = getNonExactMatch(advancedOroraze(row[0]),
                                                    row[1])
        #if the list is not empty
        if nonMatchingAlignment:
            for nonMatchTupl in nonMatchingAlignment:
                #use the original token as a key
                trainedDict[nonMatchTupl[0]] = trainedDict.get(
                    nonMatchTupl[0], list()) + [nonMatchTupl[1]]
    #clean the dict
    for origKey, goldValList in dict(trainedDict).items():
        #eliminate all the elements in the dict that have multiple possible outputs or if the value is an empty symbol
        if len(goldValList) != 1 or goldValList[0] == u'∅':
            del trainedDict[origKey]
        else:
            trainedDict[origKey] = goldValList[0]
    #dump the dict
    if outputDictFilePath is not None:
        utilsOs.dumpDictToJsonFile(trainedDict,
                                   outputDictFilePath,
                                   overwrite=True)
    return trainedDict
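
Note: getNonExactMatch is assumed to align the original and gold-standard strings token by token and return the pairs that differ. A rough, hypothetical sketch of that behaviour (the real alignment is probably smarter than a positional zip):

def getNonExactMatch(origString, goldString):
    """ hypothetical sketch: return the (originalToken, goldToken) pairs
    that differ, aligning the two strings position by position """
    origTokens = origString.split()
    goldTokens = goldString.split()
    return [(orig, gold) for orig, gold in zip(origTokens, goldTokens) if orig != gold]
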
Example #9
def quadrigramDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    takes a corpus file, makes a dict of character 4-grams with their count
    and dumps the result in a json file
    '''
    quadrigramDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    langString = u' '.join(stringList)
    for i in range(len(langString) - 3):
        quadrigramDict[langString[i:i + 4]] = quadrigramDict.get(
            langString[i:i + 4], 0.0) + (1.0 / len(stringList))
    if outputFilePath is None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(
                inputFilePath.split(u'/')[-1], 'quadrigrams.json'))
    utilsOs.dumpDictToJsonFile(quadrigramDict, outputFilePath)
    return quadrigramDict
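
For reference, the sliding window above produces overlapping character 4-grams, and dividing by len(stringList) turns raw counts into per-line averages. A toy illustration:

# toy illustration of the character 4-gram sliding window used above
stringList = [u'abab', u'abab']
langString = u' '.join(stringList)                        # 'abab abab'
grams = [langString[i:i + 4] for i in range(len(langString) - 3)]
# grams == ['abab', 'bab ', 'ab a', 'b ab', ' aba', 'abab']
# 'abab' occurs twice, so its value ends up as 2 * (1.0 / len(stringList)) == 1.0
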
Example #10
def trigramDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    takes a corpus file, makes a dict of character 3-grams with their count
    and dumps the result in a json file
    '''
    import utilsOs
    trigramDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    langString = u' '.join(stringList)
    for i in range(len(langString) - 2):
        trigramDict[langString[i:i + 3]] = trigramDict.get(
            langString[i:i + 3], 0.0) + (1.0 / len(stringList))
    if outputFilePath is None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(
                inputFilePath.split(u'/')[-1], 'trigrams.json'))
    utilsOs.dumpDictToJsonFile(trigramDict, outputFilePath)
    return trigramDict
Example #11
def tokenDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    takes a corpus file, makes a dict of tokens with their count
    and dumps the result in a json file
    '''
    tokenDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    for string in stringList:
        tokenList = naiveRegexTokenizer(string.replace(u'/', u' '))
        for token in tokenList:
            tokenDict[token] = tokenDict.get(token,
                                             0.0) + (1.0 / len(stringList))
            #we also add the lowercase version if there is an uppercase in the token
            if any(c.isupper() for c in token):
                tokenDict[token.lower()] = tokenDict.get(
                    token.lower(), 0.0) + (1.0 / len(stringList))
    if outputFilePath is None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(
                inputFilePath.split(u'/')[-1], 'tokens.json'))
    utilsOs.dumpDictToJsonFile(tokenDict, outputFilePath)
    return tokenDict
Example #12
def modifyConfigAndIndexFiles(pathToTheExportationEnvironment):
	'''
	given the path to the sigma.js exportation environment (ending in 
	the folder "network/"), it changes the config.json file and the index.html
	file so they show the graph the way intended
	'''
	#copying config.json file
	configContent = {
		"type": "network",
		"version": "1.0",
		"data": "data.json",
		"logo": {"file": "", "link": "", "text": ""},
		"text": {"more": "", "intro": "", "title": ""},
		"legend": {"edgeLabel": "", "colorLabel": "", "nodeLabel": ""},
		"features": {"search": True, "groupSelectorAttribute": True, "hoverBehavior": "default"},
		"informationPanel": {"groupByEdgeDirection": True, "imageAttribute": False},
		"sigma": {
			"drawingProperties": {
				"defaultEdgeType": "curve",
				"defaultHoverLabelBGColor": "#002147",
				"defaultLabelBGColor": "#ddd",
				"activeFontStyle": "bold",
				"defaultLabelColor": "#000",
				"labelThreshold": 999,
				"defaultLabelHoverColor": "#fff",
				"fontStyle": "bold",
				"hoverFontStyle": "bold",
				"defaultLabelSize": 14
			},
			"graphProperties": {"maxEdgeSize": 2, "minEdgeSize": 2, "minNodeSize": 0.25, "maxNodeSize": 2.5},
			"mouseProperties": {"maxRatio": 20, "minRatio": 0.75}
		}
	}
	pathConfigJson = u'{0}config.json'.format(pathToTheExportationEnvironment)
	if utilsOs.theFileExists(pathConfigJson):
		os.remove(pathConfigJson)
	utilsOs.dumpDictToJsonFile(configContent, pathConfigJson)  
	#getting the color information from the data file
	colorCommunityDict = {}
	dataDict = utilsOs.openJsonFileAsDict(u'{0}data.json'.format(pathToTheExportationEnvironment))
	for nodeDict in dataDict[u'nodes']:
		try:
			if nodeDict[u'attributes'][u'community_lvl_0'] not in colorCommunityDict:
				colorCommunityDict[nodeDict[u'attributes'][u'community_lvl_0']] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name_lvl_0'])
			'''
			#####################################################
			#before I changed the names of the columns
			if nodeDict[u'attributes'][u'community'] not in colorCommunityDict:
				colorCommunityDict[nodeDict[u'attributes'][u'community']] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name'])
			'''
		except KeyError:
			pass
	#modifying the index.html file
	with open(u'{0}index.html'.format(pathToTheExportationEnvironment)) as indexFile:
		fileLines = indexFile.readlines()
		for index, line in enumerate(fileLines):
			if line == u'\t\t<dt class="colours"></dt>\n':
				indexDivisor = index + 1
				break
		fileLines = fileLines[:indexDivisor] + [u'\t\t<dd>\n'] + list(colorCommunityDict.values()) + [u'\t\t</dd>\n'] + fileLines[indexDivisor+1:]
	utilsOs.dumpRawLines(fileLines, u'{0}index.html'.format(pathToTheExportationEnvironment), addNewline=False, rewrite=True)
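
Note: utilsOs.dumpRawLines is not shown; from the call above it writes a list of lines to a file, with flags controlling newline insertion and overwriting. A hypothetical sketch:

import codecs

def dumpRawLines(listOfLines, pathOutputFile, addNewline=True, rewrite=True):
    """ hypothetical sketch: write raw lines to a file, optionally adding a
    newline after each line and overwriting any existing file """
    mode = u'w' if rewrite else u'a'
    with codecs.open(pathOutputFile, mode, encoding=u'utf8') as outFile:
        for line in listOfLines:
            outFile.write(line + u'\n' if addNewline else line)
    return pathOutputFile
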
Example #13
def analyseNodeListStrDistance(nodeListPath, outputPath=None):
    '''
    analyses the nodes in the node list and returns the stats
    concerning the similarities between node string labels
    '''
    import multiprocessing as mp

    pool = mp.Pool(processes=4)
    nodeSimilarsDict = {1: {}, 2: {}, 3: {}}
    nodeSetJobTitles = set()
    nodeSetSkills = set()
    #put the node Labels in a set
    with open(nodeListPath) as nodeFile:
        nodeData = nodeFile.readline()
        while nodeData:
            #get the data for each row
            nodeDataList = nodeData.split(u'\t')
            #we make sure we are not in the header
            if nodeDataList[0] != u'Id':
                #save the node id/label in a set
                if u'__s' in nodeDataList[0]:
                    nodeSetJobTitles.add(nodeDataList[1])
                elif u'__t' in nodeDataList[0]:
                    nodeSetSkills.add(nodeDataList[1])
            #get next line
            nodeData = nodeFile.readline()

    #get the number and list of N-similar nodes for each Job title node
    jobtitleResults = [
        pool.apply_async(getElemSimilarByEditDistanceOfN,
                         args=(original, nodeSetJobTitles, nodeSimilarsDict,
                               True, u'{0}__s'.format(original)))
        for original in nodeSetJobTitles
    ]
    #get the number and list of N-similar nodes for each skill node
    skillResults = [
        pool.apply_async(getElemSimilarByEditDistanceOfN,
                         args=(original, nodeSetSkills, nodeSimilarsDict, True,
                               u'{0}__t'.format(original)))
        for original in nodeSetSkills
    ]

    #merge all the obtained dict together
    def merge_two_dicts(x, y):
        w = x.copy()
        for nb in range(1, 4):
            z = w[nb].copy()  # start with x's keys and values
            z.update(
                y[nb])  # modifies z with y's keys and values & returns None
            w[nb] = z
        return w

    #prepare the objects containing the results
    dictResults = {1: {}, 2: {}, 3: {}}
    for dictToBeAdded in tqdm(jobtitleResults + skillResults):
        dictResults = merge_two_dicts(dictResults, dictToBeAdded.get())

    #dump into a json file
    if outputPath is not None:
        utilsOs.dumpDictToJsonFile(dictResults, outputPath, overwrite=True)
    #get the summary of the results
    countResultStrDistanceDict(dictResults)
    return dictResults
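
Note: getElemSimilarByEditDistanceOfN is not shown. Given the {1: {}, 2: {}, 3: {}} result structure, it presumably buckets the other node labels by their edit distance (1, 2 or 3) to the original label. A rough sketch using a plain Levenshtein distance; the argument names, the lowercasing flag and the use of the python-Levenshtein package are all assumptions guessed from the call site:

from Levenshtein import distance  # assumes the python-Levenshtein package is available

def getElemSimilarByEditDistanceOfN(original, nodeSet, nodeSimilarsDict, lowercase=True, nodeId=None):
    """ hypothetical sketch: bucket the labels of nodeSet whose edit distance
    to `original` is 1, 2 or 3 into nodeSimilarsDict[1|2|3], keyed by node id """
    origCmp = original.lower() if lowercase else original
    for other in nodeSet:
        if other == original:
            continue
        otherCmp = other.lower() if lowercase else other
        editDist = distance(origCmp, otherCmp)
        if 1 <= editDist <= 3:
            nodeSimilarsDict[editDist].setdefault(nodeId, []).append(other)
    return nodeSimilarsDict
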
Example #14
def getCanonLensInfo():
    canonDict = {}
    canonUrl = u'https://global.canon/en/c-museum/series_search.html?t=lens&s=ef'
    # open the selenium driver
    driver = webdriver.Firefox()
    driver.get(canonUrl)
    # get the lens keys
    for nb in range(2, 300):
        try:
            lensName = driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div[4]/div/div[{0}]'.format(
                    nb))
            lensUrl = (
                lensName.find_element_by_tag_name('a')).get_attribute(u'href')
            time.sleep(random.uniform(0.2, 0.5))
        except NoSuchElementException:
            try:
                driver.execute_script(
                    'window.scrollTo(0, document.body.scrollHeight*2);')
                time.sleep(random.uniform(0.5, 0.8))
                lensName = driver.find_element_by_xpath(
                    u'/html/body/div[2]/div/div/main/div[4]/div/div[{0}]'.
                    format(nb))
                lensUrl = (lensName.find_element_by_tag_name('a')
                           ).get_attribute(u'href')
            except NoSuchElementException:
                break
        canonDict[str(lensName.text).replace(u'\n', u' ')] = {u'url': lensUrl}
        # print(lensName.text.replace(u'/n', u' '))
    # open each lens url to get the specs
    for lName, lensDict in canonDict.items():
        print(lName)
        driver.get(lensDict[u'url'])
        time.sleep(random.uniform(0.8, 1.1))
        lensList = lName.split(u' ')
        # get the mount type and focal length
        lensMount = u'unk'  # default in case no known mount appears in the name
        focalLength = lensList[0] if lensList[0] != u'Extender' else lensList[1]
        for mount in [
                u'EF-S', u'EF-M', u'EF-R', u'FD', u'EF', u'TS-E', u'MP-E'
        ]:
            if mount in lName:
                lensMount = mount
                focalLength = focalLength.replace(mount, u'')
                break
        canonDict[lName][u'mount type'] = lensMount
        canonDict[lName][u'focal length'] = focalLength
        # aperture speed (f number)
        canonDict[lName][u'aperture speed'] = lensList[1] if u'f/' in lensList[
            1] else u'NaN'
        # AF
        af = u'unk'
        for afType in [u'USM', u'STM']:
            if afType in lName:
                af = afType
                break
        canonDict[lName][u'AF type'] = af
        # mark
        mark = u'I'
        for markNb in [u'II', u'III', u'IV', u'V', u'VI']:
            if markNb in lName:
                mark = markNb
                break
        canonDict[lName][u'mark'] = mark
        # IS
        canonDict[lName][
            u'image stabilization'] = u'IS' if u'IS' in lName else u'NaN'
        # lens type
        lensType = u'NaN'
        for lType in [u'Macro', u'Fisheye', u'Extender']:
            if lType in lName:
                lensType = lType
                break
        canonDict[lName][u'lens type'] = lensType
        # luxe series
        canonDict[lName][u'L series'] = u'L' if u'L' in lensList[1] else u'NaN'
        # get the rest of the specs
        row = 1
        canonDict[lName][u'release date'] = driver.find_element_by_xpath(
            u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
            .format(row)).text
        row += 1
        canonDict[lName][u'original price'] = driver.find_element_by_xpath(
            u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
            .format(row)).text
        row += 1
        canonDict[lName][
            u'nb of construction groups'] = driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
                .format(row)).text
        row += 1
        canonDict[lName][
            u'nb of construction elements'] = driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
                .format(row)).text
        if u'Diaphragm' in driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[1]'
                .format(row + 1)).text:
            row += 1
        canonDict[lName][
            u'nb of diaphragm blades'] = driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
                .format(row)).text
        if u'Minimum' in driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[1]'
                .format(row + 1)).text:
            row += 1
        canonDict[lName][u'min aperture'] = driver.find_element_by_xpath(
            u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
            .format(row)).text
        if u'Closest' in driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[1]'
                .format(row + 1)).text:
            row += 1
        canonDict[lName][
            u'closest focus distance (m)'] = driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
                .format(row)).text
        row += 1
        canonDict[lName][
            u'max magnification (x)'] = driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
                .format(row)).text
        if u'Filter' in driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[1]'
                .format(row + 1)).text:
            row += 1
        canonDict[lName][u'filter diam (mm)'] = driver.find_element_by_xpath(
            u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
            .format(row)).text
        row += 1
        canonDict[lName][
            u'max diam X length (mm)'] = driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
                .format(row)).text
        row += 1
        canonDict[lName][u'weight (g)'] = driver.find_element_by_xpath(
            u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
            .format(row)).text
    # dump
    utilsOs.dumpDictToJsonFile(canonDict,
                               pathOutputFile='./data/canonLensInfo.json',
                               overwrite=True)
    driver.close()
Example #15
def reformatFilesPreGiza(pathToEnFile, pathToFrFile, overwrite=True):
    """
    make 2 vocabulary files (occurrence dict) in the format needed by giza++ or mgiza++
    then reformats the corpus into the format needed by giza++ or mgiza++
    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :return: None
    """
    # prepare the output paths
    outputEnPath = prepareOutPutFile(pathToEnFile, fileName=u"sourceEn.vcb")
    outputFrPath = prepareOutPutFile(pathToFrFile, fileName=u"targetFr.vcb")
    outputPathGizaFormatCorpus = prepareOutPutFile(
        pathToEnFile, fileName=u"sentenceFile.giza")
    outputEnDictPath = prepareOutPutFile(pathToEnFile, fileName=u"en.json")
    outputFrDictPath = prepareOutPutFile(pathToEnFile, fileName=u"fr.json")
    outputSpDictPath = prepareOutPutFile(pathToEnFile, fileName=u"sp.json")
    outputEnIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"enId.json")
    outputFrIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"frId.json")
    # if there is not a file there yet, open the corpus Files, count the frequency of each token
    if overwrite is True or os.path.isfile(outputEnDictPath) is False:
        # make the frequency dict
        enTokFreqDict = makeFreqDict(pathToEnFile, lang=u"en")
        frTokFreqDict = makeFreqDict(pathToFrFile, lang=u"fr")
        # open the corpus files count the frequency of the sentence pairs
        spFreqDict = makeSPfreqDict(pathToEnFile, pathToFrFile)
        # sort the dict by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        # make the id dict
        enIdDict = makeIdDict(orderedKeysValuesEn)
        frIdDict = makeIdDict(orderedKeysValuesFr)
        # dump dicts
        utilsOs.dumpDictToJsonFile(enTokFreqDict, outputEnDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frTokFreqDict, outputFrDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(spFreqDict, outputSpDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(enIdDict, outputEnIdDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frIdDict, outputFrIdDictPath, overwrite)
    # if the file already exists or if overwrite is false
    else:
        enTokFreqDict = utilsOs.openJsonFileAsDict(outputEnDictPath)
        frTokFreqDict = utilsOs.openJsonFileAsDict(outputFrDictPath)
        spFreqDict = utilsOs.openJsonFileAsDict(outputSpDictPath)
        enIdDict = utilsOs.openJsonFileAsDict(outputEnIdDictPath)
        frIdDict = utilsOs.openJsonFileAsDict(outputFrIdDictPath)
        # sort the dict by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
    # dump the empty tok voc file
    if overwrite is True:
        firstLine = u"1\tUNK\t0"
        utilsOs.createEmptyFile(outputEnPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputFrPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputPathGizaFormatCorpus)
    # dump the dict in the tok voc file
    for indKv, kv in enumerate(orderedKeysValuesEn):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputEnPath, addNewLine=True)
    for indKv, kv in enumerate(orderedKeysValuesFr):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputFrPath, addNewLine=True)
    # transform and dump the corpus into the GIZA format
    appendToDumpInGizaFormat(pathToEnFile, pathToFrFile,
                             outputPathGizaFormatCorpus, enIdDict, frIdDict,
                             spFreqDict)
    return outputEnPath, outputFrPath, outputPathGizaFormatCorpus, outputEnDictPath, outputFrDictPath, outputSpDictPath
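
Note: makeIdDict is not shown. Since the vocabulary files start with the header "1\tUNK\t0" and tokens are written with indKv + 2, it presumably assigns ids starting at 2 in frequency order. A hypothetical sketch:

def makeIdDict(orderedKeysValues):
    """ hypothetical sketch: map each token to a giza++ vocabulary id,
    starting at 2 because id 1 is reserved for UNK """
    return {token: index + 2 for index, (token, freq) in enumerate(orderedKeysValues)}
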
def extractMisalignedSP(pathToSrcTrgtFiles,
                        extractionSize=100,
                        typeOfExtractors=[0, 1, 2]):
    """ given a path to the original source and target files, and the types of
    extractors to be used returns SP (sentence pairs) extracted as misaligned
    extractor types:
    - 0 : same number presence in src and trgt
    - 1 : 4 or less than 4 tokens
    - 2 : """
    extractedSp = {0: {}, 1: {}, 2: {}}
    totalLines = 0

    # get name of subset
    for subset in [
            u'/ALIGNMENT-QUALITY', u'/MISALIGNED', u'/NOT-FLAGGED', u'/QUALITY'
    ]:
        if subset in pathToSrcTrgtFiles:
            subsetName = subset
    # type 0 block
    output0Path = u'./003negativeNaiveExtractors/numberCoincidence/'
    utilsOs.createEmptyFolder(output0Path)
    # type 1 block
    output1Path = u'./003negativeNaiveExtractors/fewTokens/'
    utilsOs.createEmptyFolder(output1Path)
    # type 2 block
    output2Path = u'./003negativeNaiveExtractors/cognates/'
    utilsOs.createEmptyFolder(output2Path)
    # get the path to the src and trgt files
    srcTrgtFiles = utilsOs.goDeepGetFiles(pathToSrcTrgtFiles, format=u'.tmx')
    print(u'TOTAL FILES : ', len(srcTrgtFiles))
    for filePath in srcTrgtFiles:
        srcFilePath = u'{0}.en'.format(
            filePath) if u'en-fr' in filePath else u'{0}.fr'.format(filePath)
        trgtFilePath = u'{0}.fr'.format(
            filePath) if u'en-fr' in filePath else u'{0}.en'.format(filePath)
        # open line by line and apply extractors
        try:
            with open(srcFilePath) as srcFile:
                with open(trgtFilePath) as trgtFile:
                    srcLines = srcFile.readlines()
                    trgtLines = trgtFile.readlines()
                    for srcLnIndex, srcLn in enumerate(srcLines):
                        trgtLn = trgtLines[srcLnIndex]
                        # tokenize
                        srcLn = srcLn.lower().replace(u' pm', u'pm')
                        trgtLn = trgtLn.lower().replace(u' pm', u'pm')
                        addSeparators = [
                            u'.', u',', u':', u'/', u'-', u"''", u"'"
                        ]
                        srcTokens = utilsString.nltkTokenizer(
                            srcLn, addSeparators)
                        trgtTokens = utilsString.nltkTokenizer(
                            trgtLn, addSeparators)
                        # apply the extractors
                        if 0 in typeOfExtractors:
                            extractedSp, score = applyExtractor(
                                nbMismatch, 0.75, srcTokens, trgtTokens,
                                extractedSp, filePath, 0, int(srcLnIndex))
                        if 1 in typeOfExtractors:
                            # get context scores and location in doc
                            cntxtScores = getContextScores(
                                srcLnIndex, srcLines, trgtLines)
                            docLoc = srcLnIndex / len(srcLines)
                            extractedSp, score = applyExtractor(
                                tableOfContents,
                                0.32,
                                srcTokens,
                                trgtTokens,
                                extractedSp,
                                filePath,
                                1,
                                int(srcLnIndex),
                                contextScores=cntxtScores,
                                placeInDocument=docLoc)
                        if 2 in typeOfExtractors:
                            extractedSp, score = applyExtractor(
                                cognateCoincidence, 0.1, srcTokens, trgtTokens,
                                extractedSp, filePath, 2, int(srcLnIndex))
                    totalLines += len(srcLines)
        # some folders have no .en and .fr to each .tmx file
        # (e.g.: '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/241-CAN_CENT_OCC_HEALTH/SAFE/en-fr/')
        except FileNotFoundError:
            pass
    print(u'TOTAL LINES : ', totalLines)
    # dump the extracted sp dict into a json file
    utilsOs.dumpDictToJsonFile(
        extractedSp,
        pathOutputFile=u'./003negativeNaiveExtractors/000extractedSp.json',
        overwrite=True)
    # randomly extract and dump the file path and the line index for the extracted SP
    randomlyExtractAndDump(extractedSp, 100, subsetName)
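
Note: the extractor functions (nbMismatch, tableOfContents, cognateCoincidence) are defined elsewhere. For extractor type 0, described above as "same number presence in src and trgt", the following is a rough, hypothetical sketch of the idea; the real scoring and the 0.75 thresholding in applyExtractor may differ:

def nbMismatch(srcTokens, trgtTokens):
    """ hypothetical sketch: score how well the numbers appearing in the source
    and target token lists coincide (1.0 = same set of numbers on both sides) """
    srcNumbers = set(tok for tok in srcTokens if tok.isdigit())
    trgtNumbers = set(tok for tok in trgtTokens if tok.isdigit())
    if not srcNumbers and not trgtNumbers:
        return 1.0
    common = len(srcNumbers & trgtNumbers)
    return common / max(len(srcNumbers), len(trgtNumbers))
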