Exemple #1
0
    def test_util_file_create_if_necessary__file_read(self):
        """file_create_if_necessary writes the default content; verify it with
        file_read_all and both file_read_lines variants (Lower / Strip)."""
        if not self._test_exec("test_file_create_if_necessary"):
            return

        Prg = self.Prg
        PathTxt = os.path.join(Prg["DirWork"],
                               "test_file_create_if_necessary.txt")
        util.file_del(PathTxt)  # start from a clean state

        ContentDefault = "Cloud\nRain\nSun\r\nSnow   "
        self.assertTrue(
            util.file_create_if_necessary(Prg,
                                          PathTxt,
                                          ContentDefault=ContentDefault))

        _RetRead, ReadBack = util.file_read_all(Prg, PathTxt)
        self.assertEqual(ReadBack, ContentDefault)

        # Lower=True lowercases the lines but keeps the line endings
        self.assertEqual(util.file_read_lines(Prg, PathTxt, Lower=True),
                         ["cloud\n", "rain\n", "sun\r\n", "snow   "])

        # Strip=True removes whitespace and line endings from both ends
        self.assertEqual(util.file_read_lines(Prg, PathTxt, Strip=True),
                         ["Cloud", "Rain", "Sun", "Snow"])

        util.file_del(PathTxt)
Exemple #2
0
 def test_util_file_read_lines(self):
     """file_read_lines with Strip=True returns the stripped lines."""
     if not self._test_exec("test_file_read_lines"):
         return
     Prg = self.Prg
     TmpPath = os.path.join(Prg["DirWork"], "test_file_read_lines.txt")
     util.file_write(Prg, Fname=TmpPath, Content="cat\ndog\nelephant")
     self.assertEqual(util.file_read_lines(Prg, TmpPath, Strip=True),
                      ["cat", "dog", "elephant"])
     util.file_del(TmpPath)  # clean up the temp fixture
Exemple #3
0
 def test_fun_pdf_to_text_converter(self):
     """The configured pdf->text converter yields the expected first line."""
     if not self._test_exec("test_fun_pdf_to_text_converter"):
         return
     Prg = self.Prg
     PathTxt = os.path.join(Prg["DirWork"],
                            "test_converted_from_pdf.txt")
     util.file_del(PathTxt)  # remove leftovers from earlier runs
     PathPdf = os.path.join(Prg["DirTestFiles"],
                            "test_pdf_conversion.pdf")
     Prg["ConverterPdfToText"](Prg, PathPdf, PathTxt)
     Converted = util.file_read_lines(Prg, PathTxt, Strip=True)
     self.assertEqual(Converted[0], "This is new document.")
     util.file_del(PathTxt)
Exemple #4
0
def document_obj_create_in_document_objects(Prg,
                                            DocumentObjects,
                                            ConvertedFileOrigNames_AbsPath,
                                            FileTextAbsPath,
                                            FileIndexAbsPath,
                                            FileSentencesAbsPath,
                                            WordPositionInLines=None):
    """Insert a document_obj for FileTextAbsPath into DocumentObjects.

    The key is the text file's basename without extension. Also makes sure a
    source-webpage record exists for the document in Prg (registering a
    placeholder record when the sample db has none).

    Params:
        Prg: program state dict (reads "DirTextSamples",
             "DocumentsSourceWebpages").
        DocumentObjects: dict the new document_obj is inserted into.
        ConvertedFileOrigNames_AbsPath: maps basename (no extension) ->
             absolute path of the original pre-conversion file (pdf/html).
        FileTextAbsPath: absolute path of the converted text file.
        FileIndexAbsPath: absolute path of the word-position index file.
        FileSentencesAbsPath: absolute path of the sentences file.
        WordPositionInLines: optional precomputed word-position mapping;
             defaults to a fresh empty dict per call.
    """
    if WordPositionInLines is None:  # 'is None', and avoid a shared mutable default
        WordPositionInLines = dict()

    BaseNameNoExt, DotExtension = util.basename_without_extension__ext(
        FileTextAbsPath)
    if BaseNameNoExt in ConvertedFileOrigNames_AbsPath:  # I can do it with .get() but it's more descriptive
        # BUGFIX: use the recorded original (pdf/html) path, not
        # FileTextAbsPath again - previously both branches computed the
        # same value, making this branch dead code.
        BaseNameNoExtOrig, DotExtensionOrig = util.basename_without_extension__ext(
            ConvertedFileOrigNames_AbsPath[BaseNameNoExt])
        FileOrig = BaseNameNoExtOrig + DotExtensionOrig
    else:
        FileOrig = BaseNameNoExt + DotExtension

    # lazy-load the sample source-webpage db once per process
    global _DocsSampleSourceWebpages
    if not _DocsSampleSourceWebpages:
        _, _DocsSampleSourceWebpages = util_json_obj.obj_from_file(
            os.path.join(Prg["DirTextSamples"],
                         "document_samples_source_webpages.json"))

    if BaseNameNoExt not in Prg["DocumentsSourceWebpages"]:
        if BaseNameNoExt in _DocsSampleSourceWebpages["docs"]:
            DocObj = _DocsSampleSourceWebpages["docs"][BaseNameNoExt]
        else:
            # unknown document: register a placeholder source record
            DocObj = {
                "url": "url_unknown",
                "source_name": "source_unknown",
                "license": "unknown license"
            }

        util_json_obj.doc_source_webpages_update_in_Prg(
            Prg, BaseNameNoExt, DocObj)  # and reload the updated db

    DocumentObjects[BaseNameNoExt] = \
           document_obj(FileOrigPathAbs=FileOrig,  # if you use pdf/html, the original
                        FileTextAbsPath=FileTextAbsPath,  # and text files are different
                        FileIndex=FileIndexAbsPath,
                        FileSentences=FileSentencesAbsPath,
                        WordPositionInLines=WordPositionInLines,

                        # list of sentences; [] when the sentences file doesn't exist yet
                        Sentences=util.file_read_lines(Prg, Fname=FileSentencesAbsPath) if isfile(FileSentencesAbsPath) else [])
def parse_user_topic(desc, encoding='utf-8'):
    """Parse TopicsDistributionOnUsers.txt for the run *desc*.

    Returns a list of rows [user_name, weight0, weight1, ...] where user_name
    has the trailing '.txt' removed and the weights are floats.
    """
    logfilename = twlda_result_file('%s/TopicsDistributionOnUsers.txt' % desc)
    user_topic = []

    for line in file_read_lines(logfilename, encoding=encoding):
        stripped = line.strip()
        if not stripped:
            # BUGFIX: ''.split('\t') yields [''] (truthy), so the previous
            # `if not data_line` check never skipped blank lines and they
            # leaked into the result as [''] rows.
            continue

        data_line = stripped.split('\t')
        data_line[0] = data_line[0][:-4]  # Remove '.txt'
        data_line[1:] = [float(v) for v in data_line[1:]]
        user_topic.append(data_line)

    return user_topic
Exemple #6
0
    def test_file_create_sentences__create_index(self):
        """file_sentence_create splits Sample into sentence lines, then
        file_index_create builds the word-position index; verify both the
        sentence lines and the encoded positions of 'london'."""
        self.maxDiff = None
        if not self._test_exec("test_file_create_sentences__create_index"):
            return

        Prg = self.Prg

        FileSentences = os.path.join(Prg["DirWork"],
                                     "test_file_create_sentences.txt")
        util.file_del(FileSentences)

        Sample = 'He is my friend. "This is \n the next - city, London -- here, in London, the sky is nice." Is this the third line, or a Book about London?'

        seeker.file_sentence_create(Prg, FileSentences, Sample)
        Wanted = [
            "He is my friend. \n",  # detect London only once from this sentence:
            '"This is the next - city, London -- here, in London, the sky is nice." \n',
            "Is this the third line, or a Book about London?"
        ]

        LinesFromFile = util.file_read_lines(Prg, FileSentences)
        self.assertEqual(Wanted, LinesFromFile)

        FileIndex = os.path.join(Prg["DirWork"],
                                 "test_file_create_index.txt")
        util.file_del(FileIndex)
        seeker.file_index_create(Prg,
                                 FileIndex,
                                 FileSentences,
                                 ForcedWrite=True)

        # FIX: removed unused locals MultiSub / MultiSubAndWord (dead code)
        # and the commented-out debug calls.
        _Status, WordPosition = util_json_obj.obj_from_file(FileIndex)

        # positions encode sentence / sub-sentence / word offsets
        self.assertEqual(set(WordPosition["london"]),
                         set([10100, 10201, 20104]))

        util.file_del(FileSentences)
        util.file_del(FileIndex)
Exemple #7
0
def test_results_load_from_mark_detection(Prg, FileResultPathElems):
    """Group consecutive marked lines of the result file.

    A line is 'marked' when it contains mark_util.MarkBg or mark_util.MarkFg.
    Each maximal run of marked lines becomes one entry.

    Returns: dict {group_id (int, from 0): newline-joined marked lines}.
    """
    FileResultPath = os.path.join(Prg["DirPrgParent"], *FileResultPathElems)
    print("File result path:", FileResultPath)

    Marks = dict()
    Pending = list()  # marked lines of the current run
    NextId = 0

    for RawLine in util.file_read_lines(Prg, Fname=FileResultPath):
        Stripped = RawLine.strip()
        IsMarked = (mark_util.MarkBg in Stripped
                    or mark_util.MarkFg in Stripped)
        if IsMarked:
            Pending.append(Stripped)
        elif Pending:
            # unmarked line ends the current run: flush it
            Marks[NextId] = "\n".join(Pending)
            Pending = list()
            NextId += 1

    if Pending:  # flush a run that reaches end-of-file
        Marks[NextId] = "\n".join(Pending)

    return Marks