def _fillBuffer(file_handle, buffer, limit=1000):
    """Append up to ``limit`` parsed lines from ``file_handle`` to ``buffer``.

    Reading stops as soon as ``readline`` returns the empty string (end of
    file), so the empty string is never handed to ``makeTuple``.
    """
    for _ in range(limit):
        line = file_handle.readline()
        if line == "":
            break
        buffer.append(makeTuple(line))


def mergeBlocks(file_handle1, file_handle2, index_file_name):
    """Merge the entries of two block files into a single index file.

    Every input line is parsed with ``makeTuple`` into an entry whose item
    ``[0]`` is the term used for comparison and whose item ``[1]`` is the
    postings value. Both inputs are assumed to be sorted by term (the merge
    only ever looks at the head of each buffer). The entry with the smaller
    term is written first; when both heads carry the same term, one entry
    with the postings of input 1 followed by those of input 2 is written.

    Entries are read in chunks of up to 1000 lines per input, and each
    output line is ``str(entry)`` followed by a newline.

    Args:
        file_handle1: open, readable text handle of the first block.
        file_handle2: open, readable text handle of the second block.
        index_file_name: path of the output file; it is created or
            truncated.
    """
    # function-scope import keeps the block self-contained
    from collections import deque

    # deques give O(1) removal from the front, unlike list.pop(0)
    buffer1 = deque()
    buffer2 = deque()

    # the with-block closes the output file even if parsing fails midway
    with open(str(index_file_name), "w") as index:
        _fillBuffer(file_handle1, buffer1)
        _fillBuffer(file_handle2, buffer2)

        # a buffer that is still empty after a refill means its input is
        # exhausted, so the merge is done once both buffers are empty
        while buffer1 or buffer2:
            if buffer1 and buffer2:
                term1 = buffer1[0][0]
                term2 = buffer2[0][0]
                if term1 < term2:
                    index.write(str(buffer1.popleft()) + "\n")
                elif term1 > term2:
                    index.write(str(buffer2.popleft()) + "\n")
                elif term1 == term2:
                    # same term in both blocks: concatenate the postings
                    entry1 = buffer1.popleft()
                    entry2 = buffer2.popleft()
                    merged_term = (entry1[0], entry1[1] + entry2[1])
                    index.write(str(merged_term) + "\n")
            elif buffer1:
                # input 2 is exhausted: copy the rest of this chunk
                while buffer1:
                    index.write(str(buffer1.popleft()) + "\n")
            else:
                # input 1 is exhausted: copy the rest of this chunk
                while buffer2:
                    index.write(str(buffer2.popleft()) + "\n")

            # refill whichever buffer ran dry
            if not buffer1:
                _fillBuffer(file_handle1, buffer1)
            if not buffer2:
                _fillBuffer(file_handle2, buffer2)
Beispiel #2
0
    def _buildSpecList(self, options):
        '''
        Use the passed arguments and build a list of the configurations that
        shall be generated.

        ``options.configurations`` is either ``None`` or ``'*'`` (return
        everything ``discovery.getSpecList()`` reports) or a ';'-separated
        list of three-element tuples such as ``(a, b, c); (a, *, *)``.
        Judging by the discovery helpers called below, the three elements
        are language, test library and arithmetic library (TODO confirm);
        ``*`` in the second and/or third position acts as a wildcard.

        Returns the list of selected specifications.

        Raises IOError if the parameter does not look like a list of
        3-tuples or if one of the tuples cannot be parsed by makeTuple.
        '''
        configurations = options.configurations

        # if no specific configurations are given, generate tests for all
        # configurations
        if configurations is None or configurations == '*':
            return discovery.getSpecList()

        # regexes that are used to interpret the 'configurations'
        # console parameter
        identRegex = r"'[a-zA-Z](\s?[a-zA-Z0-9_.-])*'"
        wildcardRegex = r"'\*'"
        specListRegex = r"\s*\(\s*.+\s*,\s*.+\s*,\s*.+\s*\)\s*" + \
                        r"(\s*;\s*\(\s*.+\s*,\s*.+\s*,\s*.+\s*\)\s*)*"

        def tripleRegex(second, third):
            # pattern for str() of a 3-tuple whose first element is an
            # identifier; second/third are identifier or wildcard patterns
            return r"\s*\(\s*" + identRegex + r"\s*,\s*" + second + \
                   r"\s*,\s*" + third + r"\s*\)\s*"

        LTARegex = tripleRegex(identRegex, identRegex)
        LTWRegex = tripleRegex(identRegex, wildcardRegex)
        LWARegex = tripleRegex(wildcardRegex, identRegex)
        LWWRegex = tripleRegex(wildcardRegex, wildcardRegex)

        if not re.match(specListRegex, configurations):
            raise IOError('Invalid configurations specification: ' +
                          configurations)

        # enclose tuple values in quotation marks, required for
        # the makeTuple method
        quoted = []
        for conf in configurations.split(';'):
            # remove parentheses and leading/trailing whitespace
            inner = conf.strip()[1:][:-1].strip()
            # get tuple values and remove leading/trailing whitespace
            vals = [v.strip() for v in inner.split(',')]
            # add quotation marks and concat again
            quoted.append('(' + ','.join("'" + v + "'" for v in vals) + ')')

        try:
            specs = [makeTuple(el.strip()) for el in quoted]
        except Exception as err:
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being reported as bad input
            raise IOError('Invalid specification list:' +
                          configurations +
                          '\nRun "python3 -m itf1788 --help" to see the '
                          'correct syntax.') from err

        specList = []

        # a spec matching none of the four patterns (e.g. a wildcard in the
        # first position) is skipped without an error
        for spec in specs:
            text = str(spec)
            if re.match(LTARegex, text):
                specList += [spec]
            elif re.match(LTWRegex, text):
                specList += \
                    discovery.getSpecListByLanguageAndTestLibrary(spec[0],
                                                                  spec[1])
            elif re.match(LWARegex, text):
                specList += \
                    discovery.getSpecListByLanguageAndArithmeticLibrary(
                        spec[0], spec[2])
            elif re.match(LWWRegex, text):
                specList += discovery.getSpecListByLanguage(spec[0])

        return specList
Beispiel #3
0
from nltk import PorterStemmer
from ast import literal_eval as makeTuple
import math, operator

## Load dictionary into memory: each line of the "dictionary" file is
## parsed with makeTuple; item 0 becomes the key, item 1 the value
dictionary = {}
print("Loading index…")
with open("dictionary", "r") as dict_file:
    for term in map(makeTuple, dict_file):
        dictionary[term[0]] = term[1]

## Load preprocessed corpus: same line format, read from the "corpus" file
corpus = {}
print("Loading preprocessed corpus…")
with open("corpus", "r") as corpus_file:
    for doc in map(makeTuple, corpus_file):
        corpus[doc[0]] = doc[1]

## compute average document length (mean len() of the corpus values)
accum = sum(len(corpus[key]) for key in corpus)
L_av = accum / len(corpus)

## Load stemmer
stemmer = PorterStemmer()

## OR flag to determine whether query contains
## a disjunction
def nextToken():
    """Return the next entry of the module-level ``tokenised_corpus``.

    One line is read from ``tokenised_corpus`` and parsed with
    ``makeTuple``. When ``readline`` yields the empty string (end of
    file), the empty string is returned instead.
    """
    raw_line = tokenised_corpus.readline()
    return makeTuple(raw_line) if raw_line != "" else ""
Beispiel #5
0
        def _buildSpecList(self, options):
            '''
            Use the passed arguments and build a list of the configurations
            that shall be generated.

            ``options.configurations`` is either ``None`` or ``'*'`` (return
            everything ``discovery.getSpecList()`` reports) or a
            ';'-separated list of three-element tuples such as
            ``(a, b, c); (a, *, *)``. Judging by the discovery helpers
            called below, the three elements are language, test library and
            arithmetic library (TODO confirm); ``*`` in the second and/or
            third position acts as a wildcard.

            Returns the list of selected specifications.

            Raises IOError if the parameter does not look like a list of
            3-tuples or if one of the tuples cannot be parsed by makeTuple.
            '''
            configurations = options.configurations

            # if no specific configurations are given, generate tests for all
            # configurations
            if configurations is None or configurations == '*':
                return discovery.getSpecList()

            # regexes that are used to interpret the 'configurations'
            # console parameter
            identRegex = r"'[a-zA-Z](\s?[a-zA-Z0-9_.-])*'"
            wildcardRegex = r"'\*'"
            specListRegex = r"\s*\(\s*.+\s*,\s*.+\s*,\s*.+\s*\)\s*" + \
                            r"(\s*;\s*\(\s*.+\s*,\s*.+\s*,\s*.+\s*\)\s*)*"

            def tripleRegex(second, third):
                # pattern for str() of a 3-tuple whose first element is an
                # identifier; second/third are identifier or wildcard
                # patterns
                return r"\s*\(\s*" + identRegex + r"\s*,\s*" + second + \
                       r"\s*,\s*" + third + r"\s*\)\s*"

            LTARegex = tripleRegex(identRegex, identRegex)
            LTWRegex = tripleRegex(identRegex, wildcardRegex)
            LWARegex = tripleRegex(wildcardRegex, identRegex)
            LWWRegex = tripleRegex(wildcardRegex, wildcardRegex)

            if not re.match(specListRegex, configurations):
                raise IOError('Invalid configurations specification: ' +
                              configurations)

            # enclose tuple values in quotation marks, required for
            # the makeTuple method
            quoted = []
            for conf in configurations.split(';'):
                # remove parentheses and leading/trailing whitespace
                inner = conf.strip()[1:][:-1].strip()
                # get tuple values and remove leading/trailing whitespace
                vals = [v.strip() for v in inner.split(',')]
                # add quotation marks and concat again
                quoted.append(
                    '(' + ','.join("'" + v + "'" for v in vals) + ')')

            try:
                specs = [makeTuple(el.strip()) for el in quoted]
            except Exception as err:
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate instead of being reported as bad input
                raise IOError('Invalid specification list:' +
                              configurations +
                              '\nRun "python3 -m itf1788 --help" to see the '
                              'correct syntax.') from err

            specList = []

            # a spec matching none of the four patterns (e.g. a wildcard in
            # the first position) is skipped without an error
            for spec in specs:
                text = str(spec)
                if re.match(LTARegex, text):
                    specList += [spec]
                elif re.match(LTWRegex, text):
                    specList += \
                        discovery.getSpecListByLanguageAndTestLibrary(
                            spec[0], spec[1])
                elif re.match(LWARegex, text):
                    specList += \
                        discovery.getSpecListByLanguageAndArithmeticLibrary(
                            spec[0], spec[2])
                elif re.match(LWWRegex, text):
                    specList += discovery.getSpecListByLanguage(spec[0])

            return specList