def mergeBlocks(file_handle1, file_handle2, index_file_name, buffer_size=1000):
    """Merge two term-sorted block files into a single index file.

    Every non-blank input line is parsed with ``makeTuple`` into an entry
    whose first item is the term and whose second item is its postings.
    Entries are read in chunks of at most ``buffer_size`` per block.  The
    entry with the smaller term is written first; entries with equal terms
    are merged into one ``(term, postings)`` entry whose postings are the
    postings of block 1 followed by those of block 2.  Each entry is written
    as ``str(entry)`` on its own line.

    Both inputs are assumed to be sorted ascending by term; this is not
    verified here.

    Args:
        file_handle1: Open, readable text handle of the first block.
        file_handle2: Open, readable text handle of the second block.
        index_file_name: Path of the output file (created or truncated).
        buffer_size: Maximum number of entries buffered per block.

    Raises:
        ValueError: If ``buffer_size`` is smaller than 1.

    The input handles are consumed but not closed by this function.
    """
    if buffer_size < 1:
        raise ValueError("buffer_size must be at least 1")

    def refill(handle):
        """Read up to ``buffer_size`` entries, stopping early at end of file."""
        entries = []
        while len(entries) < buffer_size:
            line = handle.readline()
            if line == "":
                # readline() returns "" only at end of file; parsing it
                # would raise, so stop here instead.
                break
            if line.strip():
                entries.append(makeTuple(line))
        return entries

    # The context manager closes the index even if parsing or writing fails.
    with open(str(index_file_name), "w") as index:
        buffer1 = refill(file_handle1)
        buffer2 = refill(file_handle2)
        eof1 = False
        eof2 = False

        while not (eof1 and eof2):
            if buffer1 and buffer2:
                # Both blocks have pending entries: emit the smaller term,
                # or merge the postings when the terms are equal.
                term1 = buffer1[0][0]
                term2 = buffer2[0][0]
                if term1 < term2:
                    index.write(str(buffer1.pop(0)) + "\n")
                elif term1 > term2:
                    index.write(str(buffer2.pop(0)) + "\n")
                else:
                    combined_postings = buffer1.pop(0)[1] + buffer2.pop(0)[1]
                    index.write(str((term1, combined_postings)) + "\n")
            elif buffer2:
                # An empty buffer at this point means its block is
                # exhausted, so the other buffer can be flushed as is.
                while buffer2:
                    index.write(str(buffer2.pop(0)) + "\n")
            elif buffer1:
                while buffer1:
                    index.write(str(buffer1.pop(0)) + "\n")

            # Refill drained buffers; a refill that yields nothing marks
            # the corresponding block as finished.
            if not buffer1:
                buffer1 = refill(file_handle1)
                if not buffer1:
                    print("buffer1 is empty")
                    eof1 = True
            if not buffer2:
                buffer2 = refill(file_handle2)
                if not buffer2:
                    print("buffer2 is empty")
                    eof2 = True
            # Progress trace kept from the original implementation.
            print(eof1, eof2)
def _buildSpecList(self, options):
    '''
    Use the passed arguments and build a list of the configurations that
    shall be generated.

    ``options.configurations`` is either ``None`` or ``'*'`` (use every
    configuration reported by ``discovery.getSpecList()``) or a
    semicolon-separated list of parenthesised triples
    ``(language, test library, arithmetic library)``.  The second and third
    position may be the wildcard ``*``; such triples are expanded through
    the matching ``discovery.getSpecListBy...`` lookup.

    Returns the list of resulting specifications.

    Raises IOError if the parameter is not a well-formed list of triples or
    if one of the triples is not a supported combination of identifiers and
    wildcards.
    '''
    # if no specific configurations are given, generate tests for all
    # configurations
    if options.configurations is None or options.configurations == '*':
        return discovery.getSpecList()

    def invalidList():
        # error raised whenever the parameter cannot be read as a list of
        # triples at all
        return IOError('Invalid specification list:' +
                       options.configurations +
                       '\nRun "python3 -m itf1788 --help" to see the'
                       ' correct syntax.')

    # regexes that are used to interpret the 'configurations'
    # console parameter
    identRegex = r"'[a-zA-Z](\s?[a-zA-Z0-9_.-])*'"
    wildcardRegex = r"'\*'"
    tripleRegex = r"\s*\(\s*.+\s*,\s*.+\s*,\s*.+\s*\)\s*"
    specListRegex = tripleRegex + r"(\s*;" + tripleRegex + r")*"

    def tripleOf(second, third):
        # the first position (language) must always be an identifier
        return r"\s*\(\s*" + identRegex + r"\s*,\s*" + second + \
            r"\s*,\s*" + third + r"\s*\)\s*"

    LTARegex = tripleOf(identRegex, identRegex)
    LTWRegex = tripleOf(identRegex, wildcardRegex)
    LWARegex = tripleOf(wildcardRegex, identRegex)
    LWWRegex = tripleOf(wildcardRegex, wildcardRegex)

    # input that does not even look like a list of triples used to fall
    # through without a proper error; reject it explicitly
    if not re.match(specListRegex, options.configurations):
        raise invalidList()

    # enclose tuple values in quotation marks, required for
    # the makeTuple method
    quoted = []
    for conf in options.configurations.split(';'):
        # remove parentheses and leading/trailing whitespace
        conf = conf.strip()[1:-1].strip()
        # get tuple values and remove leading/trailing whitespace
        vals = [v.strip() for v in conf.split(',')]
        # add quotation marks and concat again
        quoted.append('(' + ','.join("'" + v + "'" for v in vals) + ')')

    try:
        specs = [makeTuple(el) for el in quoted]
    except (ValueError, TypeError, SyntaxError,
            MemoryError, RecursionError) as err:
        # only parsing failures are reported as invalid input;
        # KeyboardInterrupt and the like are no longer swallowed
        raise invalidList() from err

    specList = []
    for spec in specs:
        # the regexes are applied to the repr of the parsed value, which
        # carries the quotation marks they expect
        text = str(spec)
        if re.match(LTARegex, text):
            specList.append(spec)
        elif re.match(LTWRegex, text):
            specList += \
                discovery.getSpecListByLanguageAndTestLibrary(spec[0],
                                                              spec[1])
        elif re.match(LWARegex, text):
            specList += \
                discovery.getSpecListByLanguageAndArithmeticLibrary(spec[0],
                                                                    spec[2])
        elif re.match(LWWRegex, text):
            specList += discovery.getSpecListByLanguage(spec[0])
        else:
            raise IOError('Invalid configurations specification: ' +
                          options.configurations)
    return specList
from ast import literal_eval as makeTuple
import math
import operator

from nltk import PorterStemmer

## Load dictionary into memory: every line of the "dictionary" file is
## parsed into a tuple whose first item is the key and second the value
dictionary = {}
print("Loading index…")
with open("dictionary", "r") as dict_file:
    dictionary.update(
        (term[0], term[1]) for term in map(makeTuple, dict_file)
    )

## Load preprocessed corpus the same way from the "corpus" file
corpus = {}
print("Loading preprocessed corpus…")
with open("corpus", "r") as corpus_file:
    corpus.update(
        (doc[0], doc[1]) for doc in map(makeTuple, corpus_file)
    )

## compute average document length (sum of all entry lengths divided
## by the number of entries)
accum = sum(map(len, corpus.values()))
L_av = accum / len(corpus)

## Load stemmer
stemmer = PorterStemmer()

## OR flag to determine whether query contains
## a disjunction
def nextToken():
    """Return the next entry of ``tokenised_corpus``.

    Reads one line from the module-level ``tokenised_corpus`` handle and
    returns it parsed with ``makeTuple``.  An empty string is returned once
    the handle has no more lines.
    """
    line = tokenised_corpus.readline()
    # readline() yields "" only at end of file
    if line != "":
        return makeTuple(line)
    return ""
def _buildSpecList(self, options):
    '''
    Use the passed arguments and build a list of the configurations that
    shall be generated.

    ``options.configurations`` is either ``None`` or ``'*'`` (use every
    configuration reported by ``discovery.getSpecList()``) or a
    semicolon-separated list of parenthesised triples
    ``(language, test library, arithmetic library)``.  The second and third
    position may be the wildcard ``*``; such triples are expanded through
    the matching ``discovery.getSpecListBy...`` lookup.

    Returns the list of resulting specifications.

    Raises IOError if the parameter is not a well-formed list of triples or
    if one of the triples is not a supported combination of identifiers and
    wildcards.
    '''
    # if no specific configurations are given, generate tests for all
    # configurations
    if options.configurations is None or options.configurations == '*':
        return discovery.getSpecList()

    def invalidList():
        # error raised whenever the parameter cannot be read as a list of
        # triples at all
        return IOError('Invalid specification list:' +
                       options.configurations +
                       '\nRun "python3 -m itf1788 --help" to see the'
                       ' correct syntax.')

    # regexes that are used to interpret the 'configurations'
    # console parameter
    identRegex = r"'[a-zA-Z](\s?[a-zA-Z0-9_.-])*'"
    wildcardRegex = r"'\*'"
    tripleRegex = r"\s*\(\s*.+\s*,\s*.+\s*,\s*.+\s*\)\s*"
    specListRegex = tripleRegex + r"(\s*;" + tripleRegex + r")*"

    def tripleOf(second, third):
        # the first position (language) must always be an identifier
        return r"\s*\(\s*" + identRegex + r"\s*,\s*" + second + \
            r"\s*,\s*" + third + r"\s*\)\s*"

    LTARegex = tripleOf(identRegex, identRegex)
    LTWRegex = tripleOf(identRegex, wildcardRegex)
    LWARegex = tripleOf(wildcardRegex, identRegex)
    LWWRegex = tripleOf(wildcardRegex, wildcardRegex)

    # input that does not even look like a list of triples used to fall
    # through without a proper error; reject it explicitly
    if not re.match(specListRegex, options.configurations):
        raise invalidList()

    # enclose tuple values in quotation marks, required for
    # the makeTuple method
    quoted = []
    for conf in options.configurations.split(';'):
        # remove parentheses and leading/trailing whitespace
        conf = conf.strip()[1:-1].strip()
        # get tuple values and remove leading/trailing whitespace
        vals = [v.strip() for v in conf.split(',')]
        # add quotation marks and concat again
        quoted.append('(' + ','.join("'" + v + "'" for v in vals) + ')')

    try:
        specs = [makeTuple(el) for el in quoted]
    except (ValueError, TypeError, SyntaxError,
            MemoryError, RecursionError) as err:
        # only parsing failures are reported as invalid input;
        # KeyboardInterrupt and the like are no longer swallowed
        raise invalidList() from err

    specList = []
    for spec in specs:
        # the regexes are applied to the repr of the parsed value, which
        # carries the quotation marks they expect
        text = str(spec)
        if re.match(LTARegex, text):
            specList.append(spec)
        elif re.match(LTWRegex, text):
            specList += \
                discovery.getSpecListByLanguageAndTestLibrary(spec[0],
                                                              spec[1])
        elif re.match(LWARegex, text):
            specList += \
                discovery.getSpecListByLanguageAndArithmeticLibrary(spec[0],
                                                                    spec[2])
        elif re.match(LWWRegex, text):
            specList += discovery.getSpecListByLanguage(spec[0])
        else:
            raise IOError('Invalid configurations specification: ' +
                          options.configurations)
    return specList